From b7a322deb6d5b4be1021153991a10108f8c9e84c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 7 Jul 2020 19:22:41 -0700 Subject: [PATCH 01/41] [CI] Add RMM as an optional dependency --- CMakeLists.txt | 7 ++++++ Jenkinsfile | 34 +++++++++++++++++++++++++++ cmake/ExternalLibs.cmake | 27 +++++++++++++++++++++ src/common/device_helpers.cuh | 33 ++++++++++++++++++++++---- tests/ci_build/Dockerfile.rmm | 39 +++++++++++++++++++++++++++++++ tests/ci_build/build_via_cmake.sh | 13 ++++++++++- 6 files changed, 147 insertions(+), 6 deletions(-) create mode 100644 cmake/ExternalLibs.cmake create mode 100644 tests/ci_build/Dockerfile.rmm diff --git a/CMakeLists.txt b/CMakeLists.txt index d8190c5fb809..830f2669ec21 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,7 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF) option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF) set(GPU_COMPUTE_VER "" CACHE STRING "Semicolon separated list of compute versions to be built against, e.g. '35;61'") +option(USE_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) ## Copied From dmlc option(USE_HDFS "Build with HDFS support" OFF) option(USE_AZURE "Build with AZURE support" OFF) @@ -79,6 +80,9 @@ endif (R_LIB AND GOOGLE_TEST) if (USE_AVX) message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.") endif (USE_AVX) +if (USE_RMM AND NOT (USE_CUDA)) + message(SEND_ERROR "`USE_RMM` must be enabled with `USE_CUDA` flag.") +endif (USE_RMM AND NOT (USE_CUDA)) #-- Sanitizer if (USE_SANITIZER) @@ -170,6 +174,9 @@ endif (R_LIB) # Plugin add_subdirectory(${xgboost_SOURCE_DIR}/plugin) +# 3rd-party libs +include(cmake/ExternalLibs.cmake) + #-- library if (BUILD_STATIC_LIB) add_library(xgboost STATIC) diff --git a/Jenkinsfile b/Jenkinsfile index 40db6f9c0fe9..2c9c40b4480e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -66,6 +66,7 @@ pipeline { 'build-cpu-non-omp': { BuildCPUNonOmp() }, 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, + 'build-gpu-rmm-cuda10.2': { BuildCUDAWithRMM(cuda_version: '10.2') }, 'build-jvm-packages': { BuildJVMPackages(spark_version: '2.4.3') }, 'build-jvm-doc': { BuildJVMDoc() } ]) @@ -84,6 +85,7 @@ pipeline { 'test-python-mgpu-cuda10.1': { TestPythonGPU(cuda_version: '10.1', multi_gpu: true) }, 'test-cpp-gpu': { TestCppGPU(cuda_version: '10.1') }, 'test-cpp-mgpu': { TestCppGPU(cuda_version: '10.1', multi_gpu: true) }, + 'test-rmm-cpp-gpu': { TestCppGPUWithRMM(cuda_version: '10.2') }, 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '2.4.3') }, 'test-jvm-jdk11': { CrossTestJVMwithJDK(jdk_version: '11') }, 'test-jvm-jdk12': { CrossTestJVMwithJDK(jdk_version: '12') }, @@ -262,6 +264,22 @@ def BuildCUDA(args) { } } +def BuildCUDAWithRMM(args) { + node('linux && cpu_build') { + unstash name: 'srcs' + echo "Build with CUDA ${args.cuda_version} and RMM" + def container_type = "rmm" + def docker_binary = "docker" + def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_RMM=ON + """ + echo 'Stashing C++ test executable (testxgboost)...' 
+ stash name: 'xgboost_rmm_cpp_tests', includes: 'build/testxgboost' + deleteDir() + } +} + def BuildJVMPackages(args) { node('linux && cpu') { unstash name: 'srcs' @@ -368,6 +386,22 @@ def TestCppGPU(args) { } } +def TestCppGPUWithRMM(args) { + node('linux && gpu') { + unstash name: 'xgboost_rmm_cpp_tests' + unstash name: 'srcs' + echo "Test C++, CUDA ${args.cuda_version} with RMM" + def container_type = "rmm" + def docker_binary = "nvidia-docker" + def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + echo "Using a single GPU" + sh """ + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*.MGPU_*" + """ + deleteDir() + } +} + def CrossTestJVMwithJDK(args) { node('linux && cpu') { unstash name: 'xgboost4j_jar' diff --git a/cmake/ExternalLibs.cmake b/cmake/ExternalLibs.cmake new file mode 100644 index 000000000000..7181699d508f --- /dev/null +++ b/cmake/ExternalLibs.cmake @@ -0,0 +1,27 @@ +# RMM +if (USE_RMM) + # Use Conda env if available + if(DEFINED ENV{CONDA_PREFIX}) + set(CMAKE_PREFIX_PATH "$ENV{CONDA_PREFIX};${CMAKE_PREFIX_PATH}") + message(STATUS "Detected Conda environment, CMAKE_PREFIX_PATH set to: ${CMAKE_PREFIX_PATH}") + else() + message(STATUS "No Conda environment detected") + endif() + + find_path(RMM_INCLUDE "rmm" + HINTS "$ENV{RMM_ROOT}/include") + + find_library(RMM_LIBRARY "rmm" + HINTS "$ENV{RMM_ROOT}/lib" "$ENV{RMM_ROOT}/build") + + if ((NOT RMM_LIBRARY) OR (NOT RMM_INCLUDE)) + message(FATAL_ERROR "Could not locate RMM library") + endif () + + message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") + message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") + + target_include_directories(objxgboost PUBLIC ${RMM_INCLUDE}) + target_link_libraries(objxgboost PUBLIC ${RMM_LIBRARY} cuda) + target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1) +endif () diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index d339dc72d712..c948ce130450 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -36,7 +36,15 @@ #ifdef XGBOOST_USE_NCCL #include "nccl.h" -#endif +#endif // XGBOOST_USE_NCCL + +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include "rmm/mr/device/cuda_memory_resource.hpp" +#include "rmm/mr/device/default_memory_resource.hpp" +#include "rmm/mr/device/device_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" +#include "rmm/mr/device/thrust_allocator_adaptor.hpp" +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) @@ -370,12 +378,21 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) { } namespace detail { + +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + /** * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose. 
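As context for the allocator change above: the patch selects XGBoost's base device allocator at compile time, using RMM's Thrust adaptor when XGBOOST_USE_RMM is defined and Thrust's plain cudaMalloc/cudaFree allocator otherwise. A minimal standalone sketch of that pattern follows; it assumes the RMM 0.14-era headers and rmm::mr::thrust_allocator API referenced in this diff, and the BaseDeviceAllocator/DeviceVector names are illustrative rather than the upstream identifiers.

    // Sketch of the compile-time allocator switch (illustrative, assumes the RMM 0.14-era API).
    #include <thrust/device_malloc_allocator.h>
    #include <thrust/device_vector.h>

    #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
    #include <rmm/mr/device/thrust_allocator_adaptor.hpp>
    // With RMM enabled, allocations are routed through the currently set RMM
    // device memory resource via the Thrust adaptor.
    template <typename T>
    using BaseDeviceAllocator = rmm::mr::thrust_allocator<T>;
    #else
    // Without RMM, fall back to plain cudaMalloc/cudaFree through Thrust.
    template <typename T>
    using BaseDeviceAllocator = thrust::device_malloc_allocator<T>;
    #endif

    // Any vector built on the alias follows the build-time choice above.
    template <typename T>
    using DeviceVector = thrust::device_vector<T, BaseDeviceAllocator<T>>;

Plugging the alias into thrust::device_vector like this is what lets the rest of the device code stay unchanged whichever backend is built in.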
*/ template -struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator { - using SuperT = thrust::device_malloc_allocator; +struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; using pointer = thrust::device_ptr; // NOLINT template struct rebind // NOLINT @@ -391,13 +408,19 @@ struct XGBDefaultDeviceAllocatorImpl : thrust::device_malloc_allocator { GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); return SuperT::deallocate(ptr, n); } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + using cuda_mr = rmm::mr::cuda_memory_resource; + using pool_mr = rmm::mr::pool_memory_resource; + XGBDefaultDeviceAllocatorImpl() : SuperT(new pool_mr(new cuda_mr), cudaStream_t{0}) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 }; /** - * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs allocations if verbose. Does not initialise memory on construction. + * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs + * allocations if verbose. Does not initialise memory on construction. */ template -struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator { +struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { using pointer = thrust::device_ptr; // NOLINT template struct rebind // NOLINT diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm new file mode 100644 index 000000000000..18730a9c0889 --- /dev/null +++ b/tests/ci_build/Dockerfile.rmm @@ -0,0 +1,39 @@ +ARG CUDA_VERSION +FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu18.04 + +# Environment +ENV DEBIAN_FRONTEND noninteractive +SHELL ["/bin/bash", "-c"] # Use Bash as shell + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \ + # Python + wget -O Miniconda3.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ + bash Miniconda3.sh -b -p /opt/python && \ + # CMake + wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr + +ENV PATH=/opt/python/bin:$PATH + +# Create new Conda environment with RMM +RUN \ + conda create -n rmm_test -c nvidia -c rapidsai -c conda-forge -c defaults \ + python=3.7 rmm=0.14 cudatoolkit=$CUDA_VERSION + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh index 98808141b0e8..8fc0c0acc66f 100755 --- a/tests/ci_build/build_via_cmake.sh +++ b/tests/ci_build/build_via_cmake.sh @@ -1,10 +1,21 @@ #!/usr/bin/env bash set -e +if [[ "$1" == --conda-env=* ]] +then + conda_env=$(echo "$1" | sed 's/^--conda-env=//g' -) + echo "Activating Conda environment ${conda_env}" + shift 1 + cmake_args="$@" + source activate ${conda_env} +else + cmake_args="$@" +fi + rm -rf build mkdir build cd build -cmake .. "$@" -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON +cmake .. 
${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON make clean make -j$(nproc) cd .. From e15845d4e72e890c2babe31a988b26503a7d9038 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 8 Jul 2020 16:08:59 -0700 Subject: [PATCH 02/41] Replace caching allocator with pool allocator from RMM --- src/common/device_helpers.cuh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index c948ce130450..3870f6082d2d 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -420,7 +420,7 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { * allocations if verbose. Does not initialise memory on construction. */ template -struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { +struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator { using pointer = thrust::device_ptr; // NOLINT template struct rebind // NOLINT @@ -462,8 +462,13 @@ using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; /*! Be careful that the initialization constructor is a no-op, which means calling * `vec.resize(n)` won't initialize the memory region to 0. Instead use * `vec.resize(n, 0)`*/ +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBCachingDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 template using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 /** \brief Specialisation of thrust device vector using custom allocator. */ template using device_vector = thrust::device_vector>; // NOLINT From 812c2092240e2e9011d319602656f8e26236b1f1 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 8 Jul 2020 21:43:52 -0700 Subject: [PATCH 03/41] Revert "Replace caching allocator with pool allocator from RMM" This reverts commit e15845d4e72e890c2babe31a988b26503a7d9038. --- src/common/device_helpers.cuh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 3870f6082d2d..e8054d9948b0 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -462,13 +462,8 @@ using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; /*! Be careful that the initialization constructor is a no-op, which means calling * `vec.resize(n)` won't initialize the memory region to 0. Instead use * `vec.resize(n, 0)`*/ -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBCachingDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; -#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 template using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 /** \brief Specialisation of thrust device vector using custom allocator. 
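The later commits in this series (patches 04 through 07 below) converge on routing these allocators through RMM's process-wide default resource instead of constructing a pool inside each allocator. Below is a minimal sketch of that setup, using only the RMM 0.14-era types already named in these diffs (cuda_memory_resource, pool_memory_resource, set_default_resource); the RMMPoolGuard wrapper and the main() scaffolding are illustrative additions, not upstream code.

    // Sketch: install an RMM pool as the default device memory resource
    // (illustrative; mirrors the ownership arrangement adopted later in this series).
    #include <rmm/mr/device/cuda_memory_resource.hpp>
    #include <rmm/mr/device/default_memory_resource.hpp>
    #include <rmm/mr/device/pool_memory_resource.hpp>

    struct RMMPoolGuard {
      rmm::mr::cuda_memory_resource cuda_mr;
      rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr;
      // Both resources live inside the guard, so they stay valid for as long
      // as the guard does; keep the guard alive for the whole process.
      RMMPoolGuard() : cuda_mr{}, pool_mr{&cuda_mr} {
        rmm::mr::set_default_resource(&pool_mr);
      }
    };

    int main() {
      RMMPoolGuard guard;  // callers of rmm::mr::get_default_resource() now draw from the pool
      // ... run training / tests here ...
      return 0;
    }

Keeping the guard on the stack in main() addresses the same lifetime concern the series works through in patches 06, 07 and 11: the default resource must outlive every allocation made through it.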
*/ template using device_vector = thrust::device_vector>; // NOLINT From a8911128d0dcf598e5ed50adf0db4d777a873d35 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 8 Jul 2020 21:48:41 -0700 Subject: [PATCH 04/41] Use rmm::mr::get_default_resource() --- src/common/device_helpers.cuh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index e8054d9948b0..ab40a46ba2d0 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -39,10 +39,7 @@ #endif // XGBOOST_USE_NCCL #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include "rmm/mr/device/cuda_memory_resource.hpp" #include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/device_memory_resource.hpp" -#include "rmm/mr/device/pool_memory_resource.hpp" #include "rmm/mr/device/thrust_allocator_adaptor.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 @@ -409,9 +406,7 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { return SuperT::deallocate(ptr, n); } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - using cuda_mr = rmm::mr::cuda_memory_resource; - using pool_mr = rmm::mr::pool_memory_resource; - XGBDefaultDeviceAllocatorImpl() : SuperT(new pool_mr(new cuda_mr), cudaStream_t{0}) {} + XGBDefaultDeviceAllocatorImpl() : SuperT(rmm::mr::get_default_resource(), cudaStream_t{0}) {} #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 }; From b5eb54d8c71ca62bdf2d0012f4fcd17ecef133e9 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 10:05:59 -0700 Subject: [PATCH 05/41] Try setting default resource (doesn't work yet) --- tests/cpp/helpers.cc | 4 ++++ tests/cpp/helpers.cu | 12 ++++++++++++ tests/cpp/helpers.h | 2 ++ tests/cpp/test_main.cc | 3 +++ 4 files changed, 21 insertions(+) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 2274e57e7307..e3b3ccb88194 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -478,4 +478,8 @@ std::unique_ptr CreateTrainedGBM( return gbm; } +#ifndef XGBOOST_USE_CUDA +void SetUpRMMResource() {} +#endif // XGBOOST_USE_CUDA + } // namespace xgboost diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index 9b70ea543231..7d240cb60bc7 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -4,6 +4,11 @@ #include "../../src/data/device_adapter.cuh" #include "../../src/data/iterative_device_dmatrix.h" +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include "rmm/mr/device/default_memory_resource.hpp" +#include "rmm/mr/device/cnmem_memory_resource.hpp" +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + namespace xgboost { CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, @@ -40,4 +45,11 @@ std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_la 0, bins_); return m; } + +void SetUpRMMResource() { +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + rmm::mr::cnmem_memory_resource pool_mr{}; + rmm::mr::set_default_resource(&pool_mr); +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +} } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 0783b9a89dba..441e8a6b0c5d 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -352,5 +352,7 @@ inline int Next(DataIterHandle self) { return static_cast(self)->Next(); } +void SetUpRMMResource(); + } // namespace xgboost #endif diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index d9f3e8f338a8..eaf2274250d3 100644 --- a/tests/cpp/test_main.cc +++ 
b/tests/cpp/test_main.cc @@ -5,7 +5,10 @@ #include #include +#include "helpers.h" + int main(int argc, char ** argv) { + xgboost::SetUpRMMResource(); xgboost::Args args {{"verbosity", "2"}}; xgboost::ConsoleLogger::Configure(args); From 6abd4c0e29904bf9b67000bb9acf1bcd8cccea03 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 10:31:02 -0700 Subject: [PATCH 06/41] Allocate pool_mr in the heap --- tests/cpp/helpers.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index 7d240cb60bc7..c5ec9cc72dc9 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -5,8 +5,10 @@ #include "../../src/data/iterative_device_dmatrix.h" #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include #include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/cnmem_memory_resource.hpp" +#include "rmm/mr/device/cuda_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 namespace xgboost { @@ -48,8 +50,11 @@ std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_la void SetUpRMMResource() { #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - rmm::mr::cnmem_memory_resource pool_mr{}; - rmm::mr::set_default_resource(&pool_mr); + using cuda_mr_t = rmm::mr::cuda_memory_resource; + using pool_mr_t = rmm::mr::pool_memory_resource; + auto cuda_mr = std::make_unique(); + auto pool_mr = std::make_unique(cuda_mr.release()); + rmm::mr::set_default_resource(pool_mr.release()); #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 } } // namespace xgboost From 2bdbc238c80c88ab0ba2793850b4ab2c9ddb9da3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 12:04:45 -0700 Subject: [PATCH 07/41] Prevent leaking pool_mr handle --- tests/cpp/helpers.cc | 14 +++++++++++--- tests/cpp/helpers.cu | 20 +++++++++++++++----- tests/cpp/helpers.h | 5 ++++- tests/cpp/test_main.cc | 3 ++- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index e3b3ccb88194..04c7f934c621 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -478,8 +478,16 @@ std::unique_ptr CreateTrainedGBM( return gbm; } -#ifndef XGBOOST_USE_CUDA -void SetUpRMMResource() {} -#endif // XGBOOST_USE_CUDA +#if !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 +class RMMAllocator {}; + +void DeleteRMMResource(RMMAllocator* r) { + delete r; +} + +RMMAllocatorPtr SetUpRMMResource() { + return RMMAllocatorPtr(nullptr, DeleteRMMResource); +} +#endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 } // namespace xgboost diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index c5ec9cc72dc9..ebaa0a49f20c 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -48,13 +48,23 @@ std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_la return m; } -void SetUpRMMResource() { #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - using cuda_mr_t = rmm::mr::cuda_memory_resource; - using pool_mr_t = rmm::mr::pool_memory_resource; +using cuda_mr_t = rmm::mr::cuda_memory_resource; +using pool_mr_t = rmm::mr::pool_memory_resource; +class RMMAllocator { + public: + std::unique_ptr handle; +}; + +void DeleteRMMResource(RMMAllocator* r) { + delete r; +} + +RMMAllocatorPtr SetUpRMMResource() { auto cuda_mr = std::make_unique(); auto pool_mr = std::make_unique(cuda_mr.release()); - rmm::mr::set_default_resource(pool_mr.release()); -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM 
== 1 + rmm::mr::set_default_resource(pool_mr.get()); + return RMMAllocatorPtr(new RMMAllocator{std::move(pool_mr)}, DeleteRMMResource); } +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 441e8a6b0c5d..967223fed9bb 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -352,7 +353,9 @@ inline int Next(DataIterHandle self) { return static_cast(self)->Next(); } -void SetUpRMMResource(); +class RMMAllocator; +using RMMAllocatorPtr = std::unique_ptr; +RMMAllocatorPtr SetUpRMMResource(); } // namespace xgboost #endif diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index eaf2274250d3..ed35ff775e82 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ -3,12 +3,13 @@ #include #include #include +#include #include #include "helpers.h" int main(int argc, char ** argv) { - xgboost::SetUpRMMResource(); + auto rmm_alloc = xgboost::SetUpRMMResource(); xgboost::Args args {{"verbosity", "2"}}; xgboost::ConsoleLogger::Configure(args); From c7236326ae2a32d42ab0dab2627dcb94af87a31d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 12:57:49 -0700 Subject: [PATCH 08/41] Separate EXPECT_DEATH() in separate test suite suffixed DeathTest --- tests/cpp/common/test_span.cc | 82 +++++++++++++++---- tests/cpp/common/test_span.cu | 2 +- tests/cpp/common/test_transform_range.cc | 2 +- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 2 +- 4 files changed, 68 insertions(+), 20 deletions(-) diff --git a/tests/cpp/common/test_span.cc b/tests/cpp/common/test_span.cc index c49763f3f61b..53fd32e1a1f0 100644 --- a/tests/cpp/common/test_span.cc +++ b/tests/cpp/common/test_span.cc @@ -97,11 +97,6 @@ TEST(Span, FromPtrLen) { } } - { - auto lazy = [=]() {Span tmp (arr, 5);}; - EXPECT_DEATH(lazy(), "\\[xgboost\\] Condition .* failed.\n"); - } - // dynamic extent { Span s (arr, 16); @@ -122,6 +117,15 @@ TEST(Span, FromPtrLen) { } } +TEST(SpanDeathTest, FromPtrLen) { + float arr[16]; + InitializeRange(arr, arr+16); + { + auto lazy = [=]() {Span tmp (arr, 5);}; + EXPECT_DEATH(lazy(), "\\[xgboost\\] Condition .* failed.\n"); + } +} + TEST(Span, FromFirstLast) { float arr[16]; InitializeRange(arr, arr+16); @@ -285,7 +289,13 @@ TEST(Span, ElementAccess) { ASSERT_EQ(i, arr[j]); ++j; } +} + +TEST(SpanDeathTest, ElementAccess) { + float arr[16]; + InitializeRange(arr, arr + 16); + Span s (arr); EXPECT_DEATH(s[16], "\\[xgboost\\] Condition .* failed.\n"); EXPECT_DEATH(s[-1], "\\[xgboost\\] Condition .* failed.\n"); @@ -312,7 +322,9 @@ TEST(Span, FrontBack) { ASSERT_EQ(s.front(), 0); ASSERT_EQ(s.back(), 3); } +} +TEST(SpanDeathTest, FrontBack) { { Span s; EXPECT_DEATH(s.front(), "\\[xgboost\\] Condition .* failed.\n"); @@ -340,10 +352,6 @@ TEST(Span, FirstLast) { for (size_t i = 0; i < first.size(); ++i) { ASSERT_EQ(first[i], arr[i]); } - auto constexpr kOne = static_cast::index_type>(-1); - EXPECT_DEATH(s.first(), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.first<17>(), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.first<32>(), "\\[xgboost\\] Condition .* failed.\n"); } { @@ -359,10 +367,6 @@ TEST(Span, FirstLast) { for (size_t i = 0; i < last.size(); ++i) { ASSERT_EQ(last[i], arr[i+12]); } - auto constexpr kOne = static_cast::index_type>(-1); - EXPECT_DEATH(s.last(), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.last<17>(), "\\[xgboost\\] Condition .* failed.\n"); - 
EXPECT_DEATH(s.last<32>(), "\\[xgboost\\] Condition .* failed.\n"); } // dynamic extent @@ -379,10 +383,6 @@ TEST(Span, FirstLast) { ASSERT_EQ(first[i], s[i]); } - EXPECT_DEATH(s.first(-1), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.first(17), "\\[xgboost\\] Condition .* failed.\n"); - EXPECT_DEATH(s.first(32), "\\[xgboost\\] Condition .* failed.\n"); - delete [] arr; } @@ -399,6 +399,50 @@ TEST(Span, FirstLast) { ASSERT_EQ(s[12 + i], last[i]); } + delete [] arr; + } +} + +TEST(SpanDeathTest, FirstLast) { + // static extent + { + float arr[16]; + InitializeRange(arr, arr + 16); + + Span s (arr); + auto constexpr kOne = static_cast::index_type>(-1); + EXPECT_DEATH(s.first(), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.first<17>(), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.first<32>(), "\\[xgboost\\] Condition .* failed.\n"); + } + + { + float arr[16]; + InitializeRange(arr, arr + 16); + + Span s (arr); + auto constexpr kOne = static_cast::index_type>(-1); + EXPECT_DEATH(s.last(), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.last<17>(), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.last<32>(), "\\[xgboost\\] Condition .* failed.\n"); + } + + // dynamic extent + { + float *arr = new float[16]; + InitializeRange(arr, arr + 16); + Span s (arr, 16); + EXPECT_DEATH(s.first(-1), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.first(17), "\\[xgboost\\] Condition .* failed.\n"); + EXPECT_DEATH(s.first(32), "\\[xgboost\\] Condition .* failed.\n"); + + delete [] arr; + } + + { + float *arr = new float[16]; + InitializeRange(arr, arr + 16); + Span s (arr, 16); EXPECT_DEATH(s.last(-1), "\\[xgboost\\] Condition .* failed.\n"); EXPECT_DEATH(s.last(17), "\\[xgboost\\] Condition .* failed.\n"); EXPECT_DEATH(s.last(32), "\\[xgboost\\] Condition .* failed.\n"); @@ -420,7 +464,11 @@ TEST(Span, Subspan) { auto s4 = s1.subspan(2, dynamic_extent); ASSERT_EQ(s1.data() + 2, s4.data()); ASSERT_EQ(s4.size(), s1.size() - 2); +} +TEST(SpanDeathTest, Subspan) { + int arr[16] {0}; + Span s1 (arr); EXPECT_DEATH(s1.subspan(-1, 0), "\\[xgboost\\] Condition .* failed.\n"); EXPECT_DEATH(s1.subspan(17, 0), "\\[xgboost\\] Condition .* failed.\n"); diff --git a/tests/cpp/common/test_span.cu b/tests/cpp/common/test_span.cu index 00e00d4f4c3c..7e9336902b36 100644 --- a/tests/cpp/common/test_span.cu +++ b/tests/cpp/common/test_span.cu @@ -221,7 +221,7 @@ struct TestElementAccess { } }; -TEST(GPUSpan, ElementAccess) { +TEST(GPUSpanDeathTest, ElementAccess) { dh::safe_cuda(cudaSetDevice(0)); auto test_element_access = []() { thrust::host_vector h_vec (16); diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc index 68319dfd3ff0..84163ea669b7 100644 --- a/tests/cpp/common/test_transform_range.cc +++ b/tests/cpp/common/test_transform_range.cc @@ -59,7 +59,7 @@ TEST(Transform, DeclareUnifiedTest(Basic)) { } #if !defined(__CUDACC__) -TEST(Transform, Exception) { +TEST(TransformDeathTest, Exception) { size_t const kSize {16}; std::vector h_in(kSize); const HostDeviceVector in_vec{h_in, -1}; diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 3210a25a1d5f..4879ca080937 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -119,7 +119,7 @@ void TestIncorrectRow() { }); } -TEST(RowPartitioner, IncorrectRow) { +TEST(RowPartitionerDeathTest, IncorrectRow) { ASSERT_DEATH({ TestIncorrectRow(); 
},".*"); } } // namespace tree From 78c22544d89306d5f4b24ea19278b40b5e181b87 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 13:01:05 -0700 Subject: [PATCH 09/41] Turn off death tests for RMM --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2c9c40b4480e..e2f3980dc9bd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -396,7 +396,7 @@ def TestCppGPUWithRMM(args) { def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" echo "Using a single GPU" sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*.MGPU_*" + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*.MGPU_*:*DeathTest.*" """ deleteDir() } From a520fa157a2d1c67b6c92a72c60a390ae365faf7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 16:09:52 -0700 Subject: [PATCH 10/41] Address reviewer's feedback --- tests/cpp/CMakeLists.txt | 2 ++ tests/cpp/helpers.cc | 29 ++++++++++++++++++++++++++--- tests/cpp/helpers.cu | 27 --------------------------- tests/cpp/helpers.h | 2 +- tests/cpp/test_main.cc | 2 +- 5 files changed, 30 insertions(+), 32 deletions(-) diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 75948e5251f7..f6cfc24b26f1 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -29,6 +29,8 @@ if (USE_CUDA) $<$:${GEN_CODE}>) target_compile_definitions(testxgboost PRIVATE -DXGBOOST_USE_CUDA=1) + find_package(CUDA) + target_include_directories(testxgboost PRIVATE ${CUDA_INCLUDE_DIRS}) set_target_properties(testxgboost PROPERTIES CUDA_SEPARABLE_COMPILATION OFF) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 04c7f934c621..9e7683c6757b 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -20,6 +20,13 @@ #include "../../src/gbm/gbtree_model.h" #include "xgboost/predictor.h" +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include +#include "rmm/mr/device/default_memory_resource.hpp" +#include "rmm/mr/device/cuda_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + bool FileExists(const std::string& filename) { struct stat st; return stat(filename.c_str(), &st) == 0; @@ -478,14 +485,30 @@ std::unique_ptr CreateTrainedGBM( return gbm; } -#if !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 -class RMMAllocator {}; +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +using cuda_mr_t = rmm::mr::cuda_memory_resource; +using pool_mr_t = rmm::mr::pool_memory_resource; +class RMMAllocator { + public: + std::unique_ptr handle; +}; void DeleteRMMResource(RMMAllocator* r) { delete r; } -RMMAllocatorPtr SetUpRMMResource() { +RMMAllocatorPtr SetUpRMMResourceForCppTests() { + auto cuda_mr = std::make_unique(); + auto pool_mr = std::make_unique(cuda_mr.release()); + rmm::mr::set_default_resource(pool_mr.get()); + return RMMAllocatorPtr(new RMMAllocator{std::move(pool_mr)}, DeleteRMMResource); +} +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +class RMMAllocator {}; + +void DeleteRMMResource(RMMAllocator* r) {} + +RMMAllocatorPtr SetUpRMMResourceForCppTests() { return RMMAllocatorPtr(nullptr, DeleteRMMResource); } #endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index ebaa0a49f20c..9b70ea543231 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ 
-4,13 +4,6 @@ #include "../../src/data/device_adapter.cuh" #include "../../src/data/iterative_device_dmatrix.h" -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include -#include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/cuda_memory_resource.hpp" -#include "rmm/mr/device/pool_memory_resource.hpp" -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - namespace xgboost { CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, @@ -47,24 +40,4 @@ std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_la 0, bins_); return m; } - -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -using cuda_mr_t = rmm::mr::cuda_memory_resource; -using pool_mr_t = rmm::mr::pool_memory_resource; -class RMMAllocator { - public: - std::unique_ptr handle; -}; - -void DeleteRMMResource(RMMAllocator* r) { - delete r; -} - -RMMAllocatorPtr SetUpRMMResource() { - auto cuda_mr = std::make_unique(); - auto pool_mr = std::make_unique(cuda_mr.release()); - rmm::mr::set_default_resource(pool_mr.get()); - return RMMAllocatorPtr(new RMMAllocator{std::move(pool_mr)}, DeleteRMMResource); -} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 967223fed9bb..eb1b5d7733a2 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -355,7 +355,7 @@ inline int Next(DataIterHandle self) { class RMMAllocator; using RMMAllocatorPtr = std::unique_ptr; -RMMAllocatorPtr SetUpRMMResource(); +RMMAllocatorPtr SetUpRMMResourceForCppTests(); } // namespace xgboost #endif diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index ed35ff775e82..cb2b78679e11 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ -9,7 +9,7 @@ #include "helpers.h" int main(int argc, char ** argv) { - auto rmm_alloc = xgboost::SetUpRMMResource(); + auto rmm_alloc = xgboost::SetUpRMMResourceForCppTests(); xgboost::Args args {{"verbosity", "2"}}; xgboost::ConsoleLogger::Configure(args); From a73391cb6e5603cccb3a4c7420c5976bb5a864e7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 9 Jul 2020 19:35:27 -0700 Subject: [PATCH 11/41] Prevent leaking of cuda_mr --- tests/cpp/helpers.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 9e7683c6757b..3b5cfeced15c 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -490,7 +490,10 @@ using cuda_mr_t = rmm::mr::cuda_memory_resource; using pool_mr_t = rmm::mr::pool_memory_resource; class RMMAllocator { public: - std::unique_ptr handle; + cuda_mr_t cuda_mr; + pool_mr_t pool_mr; + RMMAllocator() : cuda_mr(), pool_mr(&cuda_mr) {} + ~RMMAllocator() = default; }; void DeleteRMMResource(RMMAllocator* r) { @@ -498,10 +501,9 @@ void DeleteRMMResource(RMMAllocator* r) { } RMMAllocatorPtr SetUpRMMResourceForCppTests() { - auto cuda_mr = std::make_unique(); - auto pool_mr = std::make_unique(cuda_mr.release()); - rmm::mr::set_default_resource(pool_mr.get()); - return RMMAllocatorPtr(new RMMAllocator{std::move(pool_mr)}, DeleteRMMResource); + auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource); + rmm::mr::set_default_resource(&ptr->pool_mr); + return ptr; } #else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 class RMMAllocator {}; From fa4ec118255ef2455cf6ec5149c88248676537bb Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 20:26:04 -0700 Subject: [PATCH 12/41] Fix Jenkinsfile syntax --- Jenkinsfile | 14 +++++++------- 1 file 
changed, 7 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6c27372d0f74..882b6753b04a 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -280,12 +280,12 @@ def BuildCUDA(args) { stash name: "xgboost_cpp_tests_cuda${args.cuda_version}", includes: 'build/testxgboost' if (args.build_rmm) { echo "Build with CUDA ${args.cuda_version} and RMM" - def container_type = "rmm" - def docker_binary = "docker" - def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" + container_type = "rmm" + docker_binary = "docker" + docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" sh """ rm -rf build/ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_RMM=ON ${arch_flag} + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} """ echo 'Stashing C++ test executable (testxgboost)...' stash name: "xgboost_cpp_tests_rmm_cuda${args.cuda_version}", includes: 'build/testxgboost' @@ -398,9 +398,9 @@ def TestCppGPU(args) { sh "rm -rfv build/" unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" - def container_type = "rmm" - def docker_binary = "nvidia-docker" - def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" + container_type = "rmm" + docker_binary = "nvidia-docker" + docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" echo "Using a single GPU" sh """ ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*DeathTest.*" From 871fc296f0ddbe74ba650ebf586e5b75f570d17f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 20:28:09 -0700 Subject: [PATCH 13/41] Remove unnecessary function in Jenkinsfile --- Jenkinsfile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 882b6753b04a..52952729cb9d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -410,12 +410,6 @@ def TestCppGPU(args) { } } -def TestCppGPUWithRMM(args) { - node('linux && gpu') { - deleteDir() - } -} - def CrossTestJVMwithJDK(args) { node('linux && cpu') { unstash name: 'xgboost4j_jar' From 48051dfa97eeaa998d0655bc10f6f22dd945a419 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 20:59:12 -0700 Subject: [PATCH 14/41] [CI] Install NCCL into RMM container --- tests/ci_build/Dockerfile.rmm | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 18730a9c0889..8f52f91f3736 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -1,5 +1,6 @@ ARG CUDA_VERSION -FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu18.04 +FROM nvidia/cuda:$CUDA_VERSION-devel-ubuntu16.04 +ARG CUDA_VERSION # Environment ENV DEBIAN_FRONTEND noninteractive @@ -16,6 +17,13 @@ RUN \ wget -nv -nc https://cmake.org/files/v3.13/cmake-3.13.0-Linux-x86_64.sh --no-check-certificate && \ bash cmake-3.13.0-Linux-x86_64.sh --skip-license --prefix=/usr +# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) +RUN \ + export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]+\.[0-9]'` && \ + export NCCL_VERSION=2.7.5-1 && \ + apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} 
libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} + ENV PATH=/opt/python/bin:$PATH # Create new Conda environment with RMM From c0a05ce091bd6e3ed0d504738a9e69ab581cca4f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 21:07:11 -0700 Subject: [PATCH 15/41] Run Python tests --- Jenkinsfile | 25 ++++++++++++------------- tests/ci_build/test_python.sh | 8 +++++++- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 52952729cb9d..b188f83e9f56 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -89,9 +89,9 @@ pipeline { parallel ([ 'test-python-cpu': { TestPythonCPU() }, 'test-python-gpu-cuda10.0': { TestPythonGPU(host_cuda_version: '10.0') }, - 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2') }, + 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', test_rmm: true) }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, - 'test-python-mgpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', multi_gpu: true) }, + 'test-python-mgpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') }, @@ -286,7 +286,11 @@ def BuildCUDA(args) { sh """ rm -rf build/ ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64 """ + echo 'Stashing Python wheel...' + stash name: "xgboost_whl_rmm_cuda${args.cuda_version}", includes: 'python-package/dist/*.whl' echo 'Stashing C++ test executable (testxgboost)...' stash name: "xgboost_cpp_tests_rmm_cuda${args.cuda_version}", includes: 'build/testxgboost' } @@ -354,17 +358,12 @@ def TestPythonGPU(args) { def container_type = "gpu" def docker_binary = "nvidia-docker" def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" - if (args.multi_gpu) { - echo "Using multiple GPUs" - sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh mgpu - """ - } else { - echo "Using a single GPU" - sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh gpu - """ - } + def mgpu_indicator = (args.multi_gpu) ? 
'mgpu' : 'gpu' + sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + sh "rm -rfv build/ python-package/dist/" + unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" deleteDir() } } diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh index fbd3644667d8..8aa73fd58452 100755 --- a/tests/ci_build/test_python.sh +++ b/tests/ci_build/test_python.sh @@ -26,12 +26,17 @@ function install_xgboost { fi } +function uninstall_xgboost { + pip uninstall -y xgboost +} + # Run specified test suite case "$suite" in gpu) source activate gpu_test install_xgboost pytest -v -s -rxXs --fulltrace -m "not mgpu" tests/python-gpu + uninstall_xgboost ;; mgpu) @@ -41,7 +46,7 @@ case "$suite" in cd tests/distributed ./runtests-gpu.sh - cd - + uninstall_xgboost ;; cpu) @@ -49,6 +54,7 @@ case "$suite" in pytest -v -s --fulltrace tests/python cd tests/distributed ./runtests.sh + uninstall_xgboost ;; *) From c12e0a6cd58d143c641a5dbc11554503f2b40dab Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 22:20:57 -0700 Subject: [PATCH 16/41] Try building with RMM, CUDA 10.0 --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index b188f83e9f56..4ce2a555251e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,7 +70,7 @@ pipeline { 'build-cpu-non-omp': { BuildCPUNonOmp() }, // Build reference, distribution-ready Python wheel with CUDA 10.0 // using CentOS 6 image - 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, + 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0', build_rmm: true) }, // The build-gpu-* builds below use Ubuntu image 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, 'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) }, From a3e0e2f2e5c8dde7a03f4ef7efd6513ffece47cc Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 22:45:48 -0700 Subject: [PATCH 17/41] Do not use RMM for CUDA 10.0 target --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4ce2a555251e..85d92de17203 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,7 +70,7 @@ pipeline { 'build-cpu-non-omp': { BuildCPUNonOmp() }, // Build reference, distribution-ready Python wheel with CUDA 10.0 // using CentOS 6 image - 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0', build_rmm: true) }, + 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, // The build-gpu-* builds below use Ubuntu image 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, 'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) }, @@ -395,7 +395,7 @@ def TestCppGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" container_type = "rmm" docker_binary = "nvidia-docker" From 3aeab6987af2edda9636ea03f902246043537fc7 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 23:15:47 -0700 Subject: [PATCH 18/41] Actually test for test_rmm flag --- Jenkinsfile | 10 ++++++---- 1 file changed, 6 insertions(+), 4 
deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 85d92de17203..ab564cb39ec6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -360,10 +360,12 @@ def TestPythonGPU(args) { def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" def mgpu_indicator = (args.multi_gpu) ? 'mgpu' : 'gpu' sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" - sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" - sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + if (args.test_rmm) { + sh "rm -rfv build/ python-package/dist/" + unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + } deleteDir() } } From 862d58092bd9f812dc9aed2a36cc30a1d7c8eb91 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 21 Jul 2020 23:18:54 -0700 Subject: [PATCH 19/41] Fix TestPythonGPU --- Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ab564cb39ec6..f75987400fd6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -362,8 +362,8 @@ def TestPythonGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() @@ -393,7 +393,7 @@ def TestCppGPU(args) { echo "Test C++, CUDA ${args.host_cuda_version}" def container_type = "gpu" def docker_binary = "nvidia-docker" - def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" + def docker_args = "--build-arg CUDA_VERSION=${artifact_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" From 2a064bffd19d7e8931ef0d1aaf8dacbe74f896d3 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Wed, 29 Jul 2020 09:19:59 +0000 Subject: [PATCH 20/41] Use CNMeM allocator, since pool allocator doesn't yet support multiGPU --- tests/cpp/helpers.cc | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 3b5cfeced15c..a8733a316891 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -22,9 +22,10 @@ #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 #include +#include +#include #include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/cuda_memory_resource.hpp" -#include "rmm/mr/device/pool_memory_resource.hpp" +#include "rmm/mr/device/cnmem_memory_resource.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 bool FileExists(const std::string& filename) { @@ -486,13 +487,18 @@ std::unique_ptr CreateTrainedGBM( } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -using cuda_mr_t = rmm::mr::cuda_memory_resource; -using pool_mr_t = rmm::mr::pool_memory_resource; + 
+std::vector GetVisibleGPUs() { + std::vector gpus(common::AllVisibleGPUs()); + std::iota(gpus.begin(), gpus.end(), 0); + return gpus; +} + +using cnmem_mr_t = rmm::mr::cnmem_memory_resource; class RMMAllocator { public: - cuda_mr_t cuda_mr; - pool_mr_t pool_mr; - RMMAllocator() : cuda_mr(), pool_mr(&cuda_mr) {} + cnmem_mr_t cnmem_mr; + RMMAllocator() : cnmem_mr(0, GetVisibleGPUs()) {} ~RMMAllocator() = default; }; @@ -502,7 +508,7 @@ void DeleteRMMResource(RMMAllocator* r) { RMMAllocatorPtr SetUpRMMResourceForCppTests() { auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource); - rmm::mr::set_default_resource(&ptr->pool_mr); + rmm::mr::set_default_resource(&ptr->cnmem_mr); return ptr; } #else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 From 789021fa31112e25b683aef39fff375403060141 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 30 Jul 2020 21:39:23 +0000 Subject: [PATCH 21/41] Use 10.0 container to build RMM-enabled XGBoost --- Jenkinsfile | 12 ++++++------ tests/ci_build/Dockerfile.rmm | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1c30760ef9e5..62e5a70ee891 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -70,10 +70,10 @@ pipeline { 'build-cpu-non-omp': { BuildCPUNonOmp() }, // Build reference, distribution-ready Python wheel with CUDA 10.0 // using CentOS 6 image - 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0') }, + 'build-gpu-cuda10.0': { BuildCUDA(cuda_version: '10.0', build_rmm: true) }, // The build-gpu-* builds below use Ubuntu image 'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') }, - 'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2', build_rmm: true) }, + 'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2') }, 'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0') }, 'build-jvm-packages-gpu-cuda10.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.0', cuda_version: '10.0') }, 'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.0') }, @@ -386,8 +386,8 @@ def TestPythonGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() @@ -417,11 +417,11 @@ def TestCppGPU(args) { echo "Test C++, CUDA ${args.host_cuda_version}" def container_type = "gpu" def docker_binary = "nvidia-docker" - def docker_args = "--build-arg CUDA_VERSION=${artifact_cuda_version}" + def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" - unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" container_type = "rmm" docker_binary = "nvidia-docker" diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 8f52f91f3736..feac054274c1 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -20,7 +20,7 @@ RUN \ # NCCL2 (License: 
https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
 RUN \
     export CUDA_SHORT=`echo $CUDA_VERSION | egrep -o '[0-9]+\.[0-9]'` && \
-    export NCCL_VERSION=2.7.5-1 && \
+    export NCCL_VERSION=2.4.8-1 && \
     apt-get update && \
     apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT}
libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} From a4b86a9ea8848e02c527e2072cf297590ced477c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 09:53:51 +0000 Subject: [PATCH 23/41] Fix Jenkinsfile --- Jenkinsfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1c30760ef9e5..8b51fcb817b2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -386,8 +386,8 @@ def TestPythonGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() @@ -417,11 +417,11 @@ def TestCppGPU(args) { echo "Test C++, CUDA ${args.host_cuda_version}" def container_type = "gpu" def docker_binary = "nvidia-docker" - def docker_args = "--build-arg CUDA_VERSION=${artifact_cuda_version}" + def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" - unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" container_type = "rmm" docker_binary = "nvidia-docker" From e5eb262c8feb1d16b7e2dc3559d960117f71764b Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 09:48:10 +0000 Subject: [PATCH 24/41] [CI] Assign larger /dev/shm to NCCL --- Jenkinsfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8b51fcb817b2..4902251fedf8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -383,12 +383,14 @@ def TestPythonGPU(args) { def docker_binary = "nvidia-docker" def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" def mgpu_indicator = (args.multi_gpu) ? 'mgpu' : 'gpu' - sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + // Allocate extra space in /dev/shm to enable NCCL + def docker_extra_params = (args.multi_gpu) ? 
"CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g'" : '' + sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" - sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() } From 4cf7f0007b1b995d1e0b48ce9b69a5949a4a1285 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 09:48:30 +0000 Subject: [PATCH 25/41] Use 10.2 artifact to run multi-GPU Python tests --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4902251fedf8..1af5192c4c85 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -92,7 +92,7 @@ pipeline { 'test-python-gpu-cuda10.0': { TestPythonGPU(host_cuda_version: '10.0') }, 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', test_rmm: true) }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, - 'test-python-mgpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, + 'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-jvm-jdk8-cuda10.0': { CrossTestJVMwithJDKGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.0') }, From d023a503f88751b192a56cc2b9bdb2250bfb559c Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 09:45:28 +0000 Subject: [PATCH 26/41] Add CUDA 10.0 -> 11.0 cross-version test; remove CUDA 10.0 target --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 1af5192c4c85..60a90e66548c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -89,8 +89,8 @@ pipeline { script { parallel ([ 'test-python-cpu': { TestPythonCPU() }, - 'test-python-gpu-cuda10.0': { TestPythonGPU(host_cuda_version: '10.0') }, 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', test_rmm: true) }, + 'test-python-gpu-cuda11.0-cross': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '11.0') }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, From abc64a311e9d0894cfc760fbaead3760b07f2b37 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 10:40:23 +0000 Subject: [PATCH 27/41] Rename Conda env rmm_test -> gpu_test --- Jenkinsfile | 4 ++-- tests/ci_build/Dockerfile.rmm | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 60a90e66548c..8fbac6bebc21 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -287,7 +287,7 @@ def BuildCUDA(args) { docker_args = "--build-arg 
CUDA_VERSION=${args.cuda_version}" sh """ rm -rf build/ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=rmm_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" ${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64 """ @@ -430,7 +430,7 @@ def TestCppGPU(args) { docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" echo "Using a single GPU" sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate rmm_test && build/testxgboost --gtest_filter=-*DeathTest.*" + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --gtest_filter=-*DeathTest.*" """ } deleteDir() diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index 8f52f91f3736..d5ebca97aec8 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -28,7 +28,7 @@ ENV PATH=/opt/python/bin:$PATH # Create new Conda environment with RMM RUN \ - conda create -n rmm_test -c nvidia -c rapidsai -c conda-forge -c defaults \ + conda create -n gpu_test -c nvidia -c rapidsai -c conda-forge -c defaults \ python=3.7 rmm=0.14 cudatoolkit=$CUDA_VERSION ENV GOSU_VERSION 1.10 From 1e7e42e8003b6d9b880770c80f234e22743d531f Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 11:09:25 +0000 Subject: [PATCH 28/41] Use env var to opt into CNMeM pool for C++ tests --- Jenkinsfile | 4 ++-- tests/cpp/helpers.cc | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 8fbac6bebc21..e6afcc229a74 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -428,9 +428,9 @@ def TestCppGPU(args) { container_type = "rmm" docker_binary = "nvidia-docker" docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" - echo "Using a single GPU" + def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e XGBOOST_CPP_TEST_USE_RMM_POOL=1'" sh """ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --gtest_filter=-*DeathTest.*" + ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --gtest_filter=-*DeathTest.*" """ } deleteDir() diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index a8733a316891..030c7c2918ae 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -507,6 +507,9 @@ void DeleteRMMResource(RMMAllocator* r) { } RMMAllocatorPtr SetUpRMMResourceForCppTests() { + if (dmlc::GetEnv("XGBOOST_CPP_TEST_USE_RMM_POOL", 0) != 1) { + return RMMAllocatorPtr(nullptr, DeleteRMMResource); + } auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource); rmm::mr::set_default_resource(&ptr->cnmem_mr); return ptr; From 1069ae0af62b48566ed32688a21ca1b1c32e341e Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 31 Jul 2020 11:39:22 -0700 Subject: [PATCH 29/41] Use identical CUDA version for RMM builds and tests --- Jenkinsfile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Jenkinsfile 
b/Jenkinsfile index e6afcc229a74..551c236db094 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -89,10 +89,11 @@ pipeline { script { parallel ([ 'test-python-cpu': { TestPythonCPU() }, - 'test-python-gpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', test_rmm: true) }, + // artifact_cuda_version doesn't apply to RMM tests; RMM tests will always match CUDA version between artifact and host env + 'test-python-gpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.2', test_rmm: true) }, 'test-python-gpu-cuda11.0-cross': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '11.0') }, 'test-python-gpu-cuda11.0': { TestPythonGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, - 'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, + 'test-python-mgpu-cuda10.2': { TestPythonGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.2', multi_gpu: true, test_rmm: true) }, 'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2', test_rmm: true) }, 'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') }, 'test-jvm-jdk8-cuda10.0': { CrossTestJVMwithJDKGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.0') }, @@ -388,8 +389,8 @@ def TestPythonGPU(args) { sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" if (args.test_rmm) { sh "rm -rfv build/ python-package/dist/" - unstash name: "xgboost_whl_rmm_cuda${artifact_cuda_version}" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" } deleteDir() @@ -423,7 +424,7 @@ def TestCppGPU(args) { sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} build/testxgboost" if (args.test_rmm) { sh "rm -rfv build/" - unstash name: "xgboost_cpp_tests_rmm_cuda${artifact_cuda_version}" + unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" echo "Test C++, CUDA ${args.host_cuda_version} with RMM" container_type = "rmm" docker_binary = "nvidia-docker" From 99a75209bb77150df61ebfebf636f0b394a6b1b0 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Thu, 6 Aug 2020 06:46:55 +0000 Subject: [PATCH 30/41] Use Pytest fixtures to enable RMM pool in Python tests --- Jenkinsfile | 2 +- tests/ci_build/test_python.sh | 18 +++++++---- tests/python-gpu/conftest.py | 32 ++++++++++++++++++++ tests/python-gpu/test_gpu_with_dask.py | 41 +++++++++++++------------- 4 files changed, 67 insertions(+), 26 deletions(-) create mode 100644 tests/python-gpu/conftest.py diff --git a/Jenkinsfile b/Jenkinsfile index 551c236db094..a1a93a628953 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -391,7 +391,7 @@ def TestPythonGPU(args) { sh "rm -rfv build/ python-package/dist/" unstash name: "xgboost_whl_rmm_cuda${args.host_cuda_version}" unstash name: "xgboost_cpp_tests_rmm_cuda${args.host_cuda_version}" - sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator}" + sh "${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_python.sh ${mgpu_indicator} 
--use-rmm-pool" } deleteDir() } diff --git a/tests/ci_build/test_python.sh b/tests/ci_build/test_python.sh index 79eff9618971..28cd5e526b3b 100755 --- a/tests/ci_build/test_python.sh +++ b/tests/ci_build/test_python.sh @@ -2,7 +2,15 @@ set -e set -x -suite=$1 +if [ "$#" -lt 1 ] +then + suite='' + args='' +else + suite=$1 + shift 1 + args="$@" +fi # Install XGBoost Python package function install_xgboost { @@ -35,14 +43,14 @@ case "$suite" in gpu) source activate gpu_test install_xgboost - pytest -v -s -rxXs --fulltrace -m "not mgpu" tests/python-gpu + pytest -v -s -rxXs --fulltrace -m "not mgpu" ${args} tests/python-gpu uninstall_xgboost ;; mgpu) source activate gpu_test install_xgboost - pytest -v -s -rxXs --fulltrace -m "mgpu" tests/python-gpu + pytest -v -s -rxXs --fulltrace -m "mgpu" ${args} tests/python-gpu cd tests/distributed ./runtests-gpu.sh @@ -52,14 +60,14 @@ case "$suite" in cpu) source activate cpu_test install_xgboost - pytest -v -s --fulltrace tests/python + pytest -v -s -rxXs --fulltrace ${args} tests/python cd tests/distributed ./runtests.sh uninstall_xgboost ;; *) - echo "Usage: $0 {gpu|mgpu|cpu}" + echo "Usage: $0 {gpu|mgpu|cpu} [extra args to pass to pytest]" exit 1 ;; esac diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py new file mode 100644 index 000000000000..af2d6aa10b6d --- /dev/null +++ b/tests/python-gpu/conftest.py @@ -0,0 +1,32 @@ +import pytest +import logging + +def has_rmm(): + try: + import rmm + return True + except ImportError: + return False + +@pytest.fixture(scope='module', autouse=True) +def setup_rmm_pool(request, pytestconfig): + if pytestconfig.getoption('--use-rmm-pool') and request.module.__name__ != 'test_gpu_with_dask': + if not has_rmm(): + raise ImportError('The --use-rmm-pool option requires the RMM package') + import rmm + from dask_cuda.utils import get_n_gpus + rmm.reinitialize(pool_allocator=True, devices=list(range(get_n_gpus()))) + +@pytest.fixture(scope='module', autouse=True) +def local_cuda_cluster_rmm_kwargs(request, pytestconfig): + if pytestconfig.getoption('--use-rmm-pool') and request.module.__name__ == 'test_gpu_with_dask': + if not has_rmm(): + raise ImportError('The --use-rmm-pool option requires the RMM package') + import rmm + from dask_cuda.utils import get_n_gpus + rmm.reinitialize() + return {'rmm_pool_size': '8GB'} + return {} + +def pytest_addoption(parser): + parser.addoption('--use-rmm-pool', action='store_true', default=False, help='Use RMM pool') diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index 45209b25871e..b3e5d8a14ed2 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -3,7 +3,6 @@ import pytest import numpy as np import asyncio -import unittest import xgboost import subprocess from hypothesis import given, strategies, settings, note @@ -151,24 +150,24 @@ def run_gpu_hist(params, num_rounds, dataset, DMatrixT, client): assert tm.non_increasing(history['train'][dataset.metric]) -class TestDistributedGPU(unittest.TestCase): +class TestDistributedGPU: @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_cudf()) @pytest.mark.skipif(**tm.no_dask_cudf()) @pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_dask_dataframe(self): - with LocalCUDACluster() as cluster: + def test_dask_dataframe(self, local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: run_with_dask_dataframe(dxgb.DaskDMatrix, 
client) run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client) - @given(parameter_strategy, strategies.integers(1, 20), - tm.dataset_strategy) + @given(params=parameter_strategy, num_rounds=strategies.integers(1, 20), + dataset=tm.dataset_strategy) @settings(deadline=duration(seconds=120)) @pytest.mark.mgpu - def test_gpu_hist(self, params, num_rounds, dataset): - with LocalCUDACluster(n_workers=2) as cluster: + def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(n_workers=2, **local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, client) @@ -177,8 +176,8 @@ def test_gpu_hist(self, params, num_rounds, dataset): @pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.mgpu - def test_dask_array(self): - with LocalCUDACluster() as cluster: + def test_dask_array(self, local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: run_with_dask_array(dxgb.DaskDMatrix, client) run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client) @@ -186,15 +185,15 @@ def test_dask_array(self): @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_empty_dmatrix(self): - with LocalCUDACluster() as cluster: + def test_empty_dmatrix(self, local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: parameters = {'tree_method': 'gpu_hist', 'debug_synchronize': True} run_empty_dmatrix_reg(client, parameters) run_empty_dmatrix_cls(client, parameters) - def run_quantile(self, name): + def run_quantile(self, name, local_cuda_cluster_rmm_kwargs): if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows") @@ -217,7 +216,7 @@ def runit(worker_addr, rabit_args): env[port[0]] = port[1] return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE) - with LocalCUDACluster() as cluster: + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: workers = list(dxgb._get_client_workers(client).keys()) rabit_args = client.sync(dxgb._get_rabit_args, workers, client) @@ -235,14 +234,14 @@ def runit(worker_addr, rabit_args): @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.mgpu @pytest.mark.gtest - def test_quantile_basic(self): - self.run_quantile('AllReduceBasic') + def test_quantile_basic(self, local_cuda_cluster_rmm_kwargs): + self.run_quantile('AllReduceBasic', local_cuda_cluster_rmm_kwargs) @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.mgpu @pytest.mark.gtest - def test_quantile_same_on_all_workers(self): - self.run_quantile('SameOnAllWorkers') + def test_quantile_same_on_all_workers(self, local_cuda_cluster_rmm_kwargs): + self.run_quantile('SameOnAllWorkers', local_cuda_cluster_rmm_kwargs) async def run_from_dask_array_asyncio(scheduler_address): @@ -272,8 +271,10 @@ async def run_from_dask_array_asyncio(scheduler_address): return output -def test_with_asyncio(): - with LocalCUDACluster() as cluster: +@pytest.mark.skipif(**tm.no_dask()) +@pytest.mark.mgpu +def test_with_asyncio(local_cuda_cluster_rmm_kwargs): + with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: with Client(cluster) as client: address = client.scheduler.address output = asyncio.run(run_from_dask_array_asyncio(address)) From 92d14814d8d967172b765932cccd1e56d0afea48 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 7 Aug 2020 23:18:01 
+0000 Subject: [PATCH 31/41] Move RMM to plugin/CMakeLists.txt; use PLUGIN_RMM --- CMakeLists.txt | 11 ++++------- Jenkinsfile | 2 +- cmake/ExternalLibs.cmake | 27 --------------------------- plugin/CMakeLists.txt | 19 +++++++++++++++++++ 4 files changed, 24 insertions(+), 35 deletions(-) delete mode 100644 cmake/ExternalLibs.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 88867ce3780d..134e13498ba5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,6 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF) option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF) set(GPU_COMPUTE_VER "" CACHE STRING "Semicolon separated list of compute versions to be built against, e.g. '35;61'") -option(USE_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) ## Copied From dmlc option(USE_HDFS "Build with HDFS support" OFF) option(USE_AZURE "Build with AZURE support" OFF) @@ -61,6 +60,7 @@ address, leak, undefined and thread.") ## Plugins option(PLUGIN_LZ4 "Build lz4 plugin" OFF) option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF) +option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF) option(ADD_PKGCONFIG "Add xgboost.pc into system." ON) #-- Checks for building XGBoost @@ -83,9 +83,9 @@ endif (R_LIB AND GOOGLE_TEST) if (USE_AVX) message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.") endif (USE_AVX) -if (USE_RMM AND NOT (USE_CUDA)) - message(SEND_ERROR "`USE_RMM` must be enabled with `USE_CUDA` flag.") -endif (USE_RMM AND NOT (USE_CUDA)) +if (PLUGIN_RMM AND NOT (USE_CUDA)) + message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.") +endif (PLUGIN_RMM AND NOT (USE_CUDA)) if (ENABLE_ALL_WARNINGS) if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.") @@ -194,9 +194,6 @@ endif (R_LIB) # Plugin add_subdirectory(${xgboost_SOURCE_DIR}/plugin) -# 3rd-party libs -include(cmake/ExternalLibs.cmake) - #-- library if (BUILD_STATIC_LIB) add_library(xgboost STATIC) diff --git a/Jenkinsfile b/Jenkinsfile index a1a93a628953..54d449a79b33 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -288,7 +288,7 @@ def BuildCUDA(args) { docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}" sh """ rm -rf build/ - ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_RMM=ON ${arch_flag} + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON ${arch_flag} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal" ${dockerRun} ${container_type} ${docker_binary} ${docker_args} python tests/ci_build/rename_whl.py python-package/dist/*.whl ${commit_id} manylinux2010_x86_64 """ diff --git a/cmake/ExternalLibs.cmake b/cmake/ExternalLibs.cmake deleted file mode 100644 index 7181699d508f..000000000000 --- a/cmake/ExternalLibs.cmake +++ /dev/null @@ -1,27 +0,0 @@ -# RMM -if (USE_RMM) - # Use Conda env if available - if(DEFINED ENV{CONDA_PREFIX}) - set(CMAKE_PREFIX_PATH "$ENV{CONDA_PREFIX};${CMAKE_PREFIX_PATH}") - message(STATUS "Detected Conda environment, CMAKE_PREFIX_PATH set to: ${CMAKE_PREFIX_PATH}") - else() - message(STATUS "No Conda environment detected") - endif() - - 
find_path(RMM_INCLUDE "rmm" - HINTS "$ENV{RMM_ROOT}/include") - - find_library(RMM_LIBRARY "rmm" - HINTS "$ENV{RMM_ROOT}/lib" "$ENV{RMM_ROOT}/build") - - if ((NOT RMM_LIBRARY) OR (NOT RMM_INCLUDE)) - message(FATAL_ERROR "Could not locate RMM library") - endif () - - message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") - message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") - - target_include_directories(objxgboost PUBLIC ${RMM_INCLUDE}) - target_link_libraries(objxgboost PUBLIC ${RMM_LIBRARY} cuda) - target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1) -endif () diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 86f784ca0155..1eb1274a397c 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -6,3 +6,22 @@ endif (PLUGIN_LZ4) if (PLUGIN_DENSE_PARSER) target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc) endif (PLUGIN_DENSE_PARSER) + +if (PLUGIN_RMM) + find_path(RMM_INCLUDE "rmm" + HINTS "$ENV{RMM_ROOT}/include") + + find_library(RMM_LIBRARY "rmm" + HINTS "$ENV{RMM_ROOT}/lib" "$ENV{RMM_ROOT}/build") + + if ((NOT RMM_LIBRARY) OR (NOT RMM_INCLUDE)) + message(FATAL_ERROR "Could not locate RMM library") + endif () + + message(STATUS "RMM: RMM_LIBRARY set to ${RMM_LIBRARY}") + message(STATUS "RMM: RMM_INCLUDE set to ${RMM_INCLUDE}") + + target_include_directories(objxgboost PUBLIC ${RMM_INCLUDE}) + target_link_libraries(objxgboost PUBLIC ${RMM_LIBRARY} cuda) + target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1) +endif (PLUGIN_RMM) From e74fd0dcb0ca45fedaf056011795ec519e25d2b4 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 8 Aug 2020 01:19:00 +0000 Subject: [PATCH 32/41] Use per-device MR; use command arg in gtest --- Jenkinsfile | 3 +-- src/common/device_helpers.cuh | 15 +++++++++-- src/predictor/gpu_predictor.cu | 24 +++++++++++------- tests/cpp/helpers.cc | 46 +++++++++++++++++++++++----------- tests/cpp/helpers.h | 2 +- tests/cpp/test_main.cc | 2 +- 6 files changed, 62 insertions(+), 30 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 54d449a79b33..274caeb3e352 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -429,9 +429,8 @@ def TestCppGPU(args) { container_type = "rmm" docker_binary = "nvidia-docker" docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}" - def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e XGBOOST_CPP_TEST_USE_RMM_POOL=1'" sh """ - ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --gtest_filter=-*DeathTest.*" + ${dockerRun} ${container_type} ${docker_binary} ${docker_args} bash -c "source activate gpu_test && build/testxgboost --use-rmm-pool --gtest_filter=-*DeathTest.*" """ } deleteDir() diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index ab40a46ba2d0..748aaf9c3524 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -39,7 +39,8 @@ #endif // XGBOOST_USE_NCCL #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include "rmm/mr/device/default_memory_resource.hpp" +#include +#include "rmm/mr/device/per_device_resource.hpp" #include "rmm/mr/device/thrust_allocator_adaptor.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 @@ -406,7 +407,8 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { return SuperT::deallocate(ptr, n); } #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBDefaultDeviceAllocatorImpl() : SuperT(rmm::mr::get_default_resource(), cudaStream_t{0}) {} + 
XGBDefaultDeviceAllocatorImpl() + : SuperT(rmm::mr::get_current_device_resource(), cudaStream_t{0}) {} #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 }; @@ -770,6 +772,15 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, return ToSpan(vec, offset, size); } +template ::index_type> +xgboost::common::Span ToSpan( + std::unique_ptr& vec, + IndexT offset = 0, + IndexT size = std::numeric_limits::max()) { + return ToSpan(*vec.get(), offset, size); +} + // thrust begin, similiar to std::begin template thrust::device_ptr tbegin(xgboost::HostDeviceVector& vector) { // NOLINT diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 9a498136d41a..fa31af2e4c8e 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -213,9 +213,10 @@ __global__ void PredictKernel(Data data, class DeviceModel { public: - dh::device_vector nodes; - dh::device_vector tree_segments; - dh::device_vector tree_group; + // Need to lazily construct the vectors because GPU id is only known at runtime + std::unique_ptr> nodes; + std::unique_ptr> tree_segments; + std::unique_ptr> tree_group; size_t tree_beg_; // NOLINT size_t tree_end_; // NOLINT int num_group; @@ -224,16 +225,16 @@ class DeviceModel { const thrust::host_vector& h_tree_segments, const thrust::host_vector& h_nodes, size_t tree_begin, size_t tree_end) { - nodes.resize(h_nodes.size()); - dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(), + nodes->resize(h_nodes.size()); + dh::safe_cuda(cudaMemcpyAsync(nodes->data().get(), h_nodes.data(), sizeof(RegTree::Node) * h_nodes.size(), cudaMemcpyHostToDevice)); - tree_segments.resize(h_tree_segments.size()); - dh::safe_cuda(cudaMemcpyAsync(tree_segments.data().get(), h_tree_segments.data(), + tree_segments->resize(h_tree_segments.size()); + dh::safe_cuda(cudaMemcpyAsync(tree_segments->data().get(), h_tree_segments.data(), sizeof(size_t) * h_tree_segments.size(), cudaMemcpyHostToDevice)); - tree_group.resize(model.tree_info.size()); - dh::safe_cuda(cudaMemcpyAsync(tree_group.data().get(), model.tree_info.data(), + tree_group->resize(model.tree_info.size()); + dh::safe_cuda(cudaMemcpyAsync(tree_group->data().get(), model.tree_info.data(), sizeof(int) * model.tree_info.size(), cudaMemcpyHostToDevice)); this->tree_beg_ = tree_begin; @@ -243,6 +244,11 @@ class DeviceModel { void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { dh::safe_cuda(cudaSetDevice(gpu_id)); + // Allocate device vectors using correct GPU ID context + nodes.reset(new dh::device_vector()); + tree_segments.reset(new dh::device_vector()); + tree_group.reset(new dh::device_vector()); + CHECK_EQ(model.param.size_leaf_vector, 0); // Copy decision trees to device thrust::host_vector h_tree_segments{}; diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 030c7c2918ae..0d9d3770498b 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -24,8 +24,9 @@ #include #include #include -#include "rmm/mr/device/default_memory_resource.hpp" -#include "rmm/mr/device/cnmem_memory_resource.hpp" +#include "rmm/mr/device/per_device_resource.hpp" +#include "rmm/mr/device/cuda_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 bool FileExists(const std::string& filename) { @@ -488,17 +489,23 @@ std::unique_ptr CreateTrainedGBM( #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -std::vector GetVisibleGPUs() { - std::vector gpus(common::AllVisibleGPUs()); - 
std::iota(gpus.begin(), gpus.end(), 0); - return gpus; -} - -using cnmem_mr_t = rmm::mr::cnmem_memory_resource; +using cuda_mr_t = rmm::mr::cuda_memory_resource; +using pool_mr_t = rmm::mr::pool_memory_resource<cuda_mr_t>; class RMMAllocator { public: - cnmem_mr_t cnmem_mr; - RMMAllocator() : cnmem_mr(0, GetVisibleGPUs()) {} + std::vector<std::unique_ptr<cuda_mr_t>> cuda_mr; + std::vector<std::unique_ptr<pool_mr_t>> pool_mr; + int n_gpu; + RMMAllocator() : n_gpu(common::AllVisibleGPUs()) { + int current_device; + CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess); + for (int i = 0; i < n_gpu; ++i) { + CHECK_EQ(cudaSetDevice(i), cudaSuccess); + cuda_mr.push_back(std::unique_ptr<cuda_mr_t>(new cuda_mr_t)); + pool_mr.push_back(std::unique_ptr<pool_mr_t>(new pool_mr_t(cuda_mr[i].get()))); + } + CHECK_EQ(cudaSetDevice(current_device), cudaSuccess); + } ~RMMAllocator() = default; }; @@ -506,12 +513,21 @@ void DeleteRMMResource(RMMAllocator* r) { delete r; } -RMMAllocatorPtr SetUpRMMResourceForCppTests() { - if (dmlc::GetEnv("XGBOOST_CPP_TEST_USE_RMM_POOL", 0) != 1) { +RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv) { + bool use_rmm_pool = false; + for (int i = 1; i < argc; ++i) { + if (argv[i] == std::string("--use-rmm-pool")) { + use_rmm_pool = true; + } + } + if (!use_rmm_pool) { return RMMAllocatorPtr(nullptr, DeleteRMMResource); } + LOG(INFO) << "Using RMM memory pool"; auto ptr = RMMAllocatorPtr(new RMMAllocator(), DeleteRMMResource); - rmm::mr::set_default_resource(&ptr->cnmem_mr); + for (int i = 0; i < ptr->n_gpu; ++i) { + rmm::mr::set_per_device_resource(rmm::cuda_device_id(i), ptr->pool_mr[i].get()); + } return ptr; } #else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 @@ -519,7 +535,7 @@ class RMMAllocator {}; void DeleteRMMResource(RMMAllocator* r) {} -RMMAllocatorPtr SetUpRMMResourceForCppTests() { +RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv) { return RMMAllocatorPtr(nullptr, DeleteRMMResource); } #endif // !defined(XGBOOST_USE_RMM) || XGBOOST_USE_RMM != 1 diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index eb1b5d7733a2..5d4ce6cefa68 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -355,7 +355,7 @@ inline int Next(DataIterHandle self) { class RMMAllocator; using RMMAllocatorPtr = std::unique_ptr<RMMAllocator, void(*)(RMMAllocator*)>; -RMMAllocatorPtr SetUpRMMResourceForCppTests(); +RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv); } // namespace xgboost #endif diff --git a/tests/cpp/test_main.cc b/tests/cpp/test_main.cc index cb2b78679e11..b93329c2e788 100644 --- a/tests/cpp/test_main.cc +++ b/tests/cpp/test_main.cc @@ -9,11 +9,11 @@ #include "helpers.h" int main(int argc, char ** argv) { - auto rmm_alloc = xgboost::SetUpRMMResourceForCppTests(); xgboost::Args args {{"verbosity", "2"}}; xgboost::ConsoleLogger::Configure(args); testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe"; + auto rmm_alloc = xgboost::SetUpRMMResourceForCppTests(argc, argv); return RUN_ALL_TESTS(); } From 2ee04b3eac05cf26af1942bd0e78d9fd009d8268 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Fri, 7 Aug 2020 19:35:29 -0700 Subject: [PATCH 33/41] Set CMake prefix path to use Conda env --- tests/ci_build/build_via_cmake.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh index 3af8bbe9b20d..44c9b5d4a29b 100755 --- a/tests/ci_build/build_via_cmake.sh +++ b/tests/ci_build/build_via_cmake.sh @@ -8,14 +8,16 @@ then shift 1 cmake_args="$@" source activate ${conda_env} + cmake_prefix_flag="-DCMAKE_PREFIX_PATH=$CONDA_PREFIX"
else cmake_args="$@" + cmake_prefix_flag='' fi rm -rf build mkdir build cd build -cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -GNinja +cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -GNinja ${cmake_prefix_flag} ninja clean time ninja -v cd .. From 87422a2f120fac78c8f372cf317b9709cddeb30d Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 7 Aug 2020 19:57:46 -0700 Subject: [PATCH 34/41] Use 0.15 nightly version of RMM --- tests/ci_build/Dockerfile.gpu | 4 ++-- tests/ci_build/Dockerfile.rmm | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu index 055caf3c1028..efc3d9186067 100644 --- a/tests/ci_build/Dockerfile.gpu +++ b/tests/ci_build/Dockerfile.gpu @@ -17,8 +17,8 @@ ENV PATH=/opt/python/bin:$PATH # Create new Conda environment with cuDF, Dask, and cuPy RUN \ - conda create -n gpu_test -c rapidsai -c nvidia -c conda-forge -c defaults \ - python=3.7 cudf=0.14 cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \ + conda create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \ + python=3.7 cudf=0.15* rmm=0.15* cudatoolkit=$CUDA_VERSION dask dask-cuda dask-cudf cupy \ numpy pytest scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis ENV GOSU_VERSION 1.10 diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm index d5ebca97aec8..a92f09c47f28 100644 --- a/tests/ci_build/Dockerfile.rmm +++ b/tests/ci_build/Dockerfile.rmm @@ -28,8 +28,8 @@ ENV PATH=/opt/python/bin:$PATH # Create new Conda environment with RMM RUN \ - conda create -n gpu_test -c nvidia -c rapidsai -c conda-forge -c defaults \ - python=3.7 rmm=0.14 cudatoolkit=$CUDA_VERSION + conda create -n gpu_test -c nvidia -c rapidsai-nightly -c rapidsai -c conda-forge -c defaults \ + python=3.7 rmm=0.15* cudatoolkit=$CUDA_VERSION ENV GOSU_VERSION 1.10 From 9021a75ea7263bef9ce94fb02d77d184eeac208a Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Fri, 7 Aug 2020 20:00:59 -0700 Subject: [PATCH 35/41] Remove unnecessary header --- src/common/device_helpers.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 748aaf9c3524..8971f705dea8 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -39,7 +39,6 @@ #endif // XGBOOST_USE_NCCL #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include #include "rmm/mr/device/per_device_resource.hpp" #include "rmm/mr/device/thrust_allocator_adaptor.hpp" #endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 From 377580a94966767880870cf4bb9283f45bdc1e82 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Sat, 8 Aug 2020 08:42:42 +0000 Subject: [PATCH 36/41] Fix a unit test when cudf is missing --- python-package/xgboost/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index b29eac7959f3..9491efd1c38c 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -317,7 +317,7 @@ def _is_cudf_df(data): import cudf except ImportError: return False - return isinstance(data, cudf.DataFrame) + return hasattr(cudf, 'DataFrame') and isinstance(data, cudf.DataFrame) def _cudf_array_interfaces(data): From 3df7cc3b668283b9b9233ded0021a21102877889 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Mon, 10 Aug 2020 
19:57:58 +0000 Subject: [PATCH 37/41] Add RMM demos --- demo/rmm_plugin/README.md | 31 ++++++++++++++++++++++ demo/rmm_plugin/rmm_mgpu_with_dask.py | 27 ++++++++++++++++++++ demo/rmm_plugin/rmm_singlegpu.py | 14 ++++++++++++ tests/pytest.ini | 3 ++- tests/python-gpu/conftest.py | 11 ++++++++- tests/python-gpu/test_gpu_demos.py | 1 + tests/python-gpu/test_gpu_with_dask.py | 2 ++ 7 files changed, 87 insertions(+), 2 deletions(-) create mode 100644 demo/rmm_plugin/README.md create mode 100644 demo/rmm_plugin/rmm_mgpu_with_dask.py create mode 100644 demo/rmm_plugin/rmm_singlegpu.py diff --git a/demo/rmm_plugin/README.md b/demo/rmm_plugin/README.md new file mode 100644 index 000000000000..ad73c61f3097 --- /dev/null +++ b/demo/rmm_plugin/README.md @@ -0,0 +1,31 @@ +Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL) +==================================================================== +The [RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of +efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory +allocators provided by RMM, by enabling the RMM integration plugin. + +The demos in this directory highlight one RMM allocator in particular: **the pool sub-allocator**. +This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory +upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid +the overhead of calling `cudaMalloc()` directly. See +[these GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf) +for more details. + +Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this, +run CMake with the option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` is also required): +``` +cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON +make -j4 +``` +CMake will attempt to locate the RMM library in your build environment. You may choose to build +RMM from source, or install it using the Conda package manager. If CMake cannot find RMM, you +should specify the location of RMM via the CMake prefix path: +``` +# If using Conda: +cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX +# If using RMM installed at a custom location +cmake ..
-DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm +``` + +* [Using RMM with a single GPU](./rmm_singlegpu.py) +* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py) diff --git a/demo/rmm_plugin/rmm_mgpu_with_dask.py b/demo/rmm_plugin/rmm_mgpu_with_dask.py new file mode 100644 index 000000000000..eac0c5da4822 --- /dev/null +++ b/demo/rmm_plugin/rmm_mgpu_with_dask.py @@ -0,0 +1,27 @@ +import xgboost as xgb +from sklearn.datasets import make_classification +import dask +from dask.distributed import Client +from dask_cuda import LocalCUDACluster + +def main(client): + X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3) + X = dask.array.from_array(X) + y = dask.array.from_array(y) + dtrain = xgb.dask.DaskDMatrix(client, X, label=y) + + params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3, + 'tree_method': 'gpu_hist'} + output = xgb.dask.train(client, params, dtrain, num_boost_round=100, + evals=[(dtrain, 'train')]) + bst = output['booster'] + history = output['history'] + for i, e in enumerate(history['train']['merror']): + print(f'[{i}] train-merror: {e}') + +if __name__ == '__main__': + # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to + # LocalCUDACluster constructor. + with LocalCUDACluster(rmm_pool_size='2GB') as cluster: + with Client(cluster) as client: + main(client) diff --git a/demo/rmm_plugin/rmm_singlegpu.py b/demo/rmm_plugin/rmm_singlegpu.py new file mode 100644 index 000000000000..c56e0a0cef43 --- /dev/null +++ b/demo/rmm_plugin/rmm_singlegpu.py @@ -0,0 +1,14 @@ +import xgboost as xgb +import rmm +from sklearn.datasets import make_classification + +# Initialize RMM pool allocator +rmm.reinitialize(pool_allocator=True) + +X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3) +dtrain = xgb.DMatrix(X, label=y) + +params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3, + 'tree_method': 'gpu_hist'} +# XGBoost will automatically use the RMM pool allocator +bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, 'train')]) diff --git a/tests/pytest.ini b/tests/pytest.ini index 136782056f95..f34505d3532c 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -2,4 +2,5 @@ markers = mgpu: Mark a test that requires multiple GPUs to run. ci: Mark a test that runs only on CI. - gtest: Mark a test that requires C++ Google Test executable. \ No newline at end of file + gtest: Mark a test that requires C++ Google Test executable. + no_rmm_pool_setup: Mark a test to skip the setup_rmm_pool() fixture. diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py index af2d6aa10b6d..1ed7cae7d308 100644 --- a/tests/python-gpu/conftest.py +++ b/tests/python-gpu/conftest.py @@ -8,9 +8,18 @@ def has_rmm(): except ImportError: return False +def get_module_attributes(module): + if not hasattr(module, 'pytestmark'): + return [] + if isinstance(module.pytestmark, list): + return [x.name for x in module.pytestmark] + return [module.pytestmark.name] + @pytest.fixture(scope='module', autouse=True) def setup_rmm_pool(request, pytestconfig): - if pytestconfig.getoption('--use-rmm-pool') and request.module.__name__ != 'test_gpu_with_dask': + if (pytestconfig.getoption('--use-rmm-pool') and + 'no_rmm_pool_setup' not in get_module_attributes(request.module)): + print('!!! 
Setting up RMM pool') if not has_rmm(): raise ImportError('The --use-rmm-pool option requires the RMM package') import rmm diff --git a/tests/python-gpu/test_gpu_demos.py b/tests/python-gpu/test_gpu_demos.py index a3a9aaff5b08..58dc262e6b4e 100644 --- a/tests/python-gpu/test_gpu_demos.py +++ b/tests/python-gpu/test_gpu_demos.py @@ -6,6 +6,7 @@ import testing as tm import test_demos as td # noqa +pytestmark = pytest.mark.no_rmm_pool_setup @pytest.mark.skipif(**tm.no_cupy()) def test_data_iterator(): diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index b3e5d8a14ed2..dc9ba0a49d4d 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -12,6 +12,8 @@ if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows", allow_module_level=True) +pytestmark = pytest.mark.no_rmm_pool_setup + sys.path.append("tests/python") from test_with_dask import run_empty_dmatrix_reg # noqa from test_with_dask import run_empty_dmatrix_cls # noqa From 567fb330a6beeb32dc0d7e873e64ebd41fe71c4f Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Mon, 10 Aug 2020 13:34:09 -0700 Subject: [PATCH 38/41] Remove print() --- tests/python-gpu/conftest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py index 1ed7cae7d308..05ddf10c5036 100644 --- a/tests/python-gpu/conftest.py +++ b/tests/python-gpu/conftest.py @@ -19,7 +19,6 @@ def get_module_attributes(module): def setup_rmm_pool(request, pytestconfig): if (pytestconfig.getoption('--use-rmm-pool') and 'no_rmm_pool_setup' not in get_module_attributes(request.module)): - print('!!! Setting up RMM pool') if not has_rmm(): raise ImportError('The --use-rmm-pool option requires the RMM package') import rmm From 1e63c465713e492de3531b811ae6d09a2cb80905 Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 11 Aug 2020 22:21:43 +0000 Subject: [PATCH 39/41] Use HostDeviceVector in GPU predictor --- src/common/device_helpers.cuh | 9 ----- src/common/host_device_vector.cu | 2 ++ src/predictor/gpu_predictor.cu | 58 ++++++++++++-------------------- 3 files changed, 23 insertions(+), 46 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 8971f705dea8..beb94680f493 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -771,15 +771,6 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, return ToSpan(vec, offset, size); } -template ::index_type> -xgboost::common::Span ToSpan( - std::unique_ptr& vec, - IndexT offset = 0, - IndexT size = std::numeric_limits::max()) { - return ToSpan(*vec.get(), offset, size); -} - // thrust begin, similiar to std::begin template thrust::device_ptr tbegin(xgboost::HostDeviceVector& vector) { // NOLINT diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 7470f1c07731..39a0fbe9efb0 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -11,6 +11,7 @@ #include "xgboost/data.h" #include "xgboost/host_device_vector.h" +#include "xgboost/tree_model.h" #include "device_helpers.cuh" namespace xgboost { @@ -402,6 +403,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_row_t template class HostDeviceVector; // bst_feature_t +template class HostDeviceVector; #if defined(__APPLE__) /* diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index fa31af2e4c8e..c05688eaf4d8 100644 --- 
a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -214,44 +214,20 @@ __global__ void PredictKernel(Data data, class DeviceModel { public: // Need to lazily construct the vectors because GPU id is only known at runtime - std::unique_ptr> nodes; - std::unique_ptr> tree_segments; - std::unique_ptr> tree_group; + HostDeviceVector nodes; + HostDeviceVector tree_segments; + HostDeviceVector tree_group; size_t tree_beg_; // NOLINT size_t tree_end_; // NOLINT int num_group; - void CopyModel(const gbm::GBTreeModel& model, - const thrust::host_vector& h_tree_segments, - const thrust::host_vector& h_nodes, - size_t tree_begin, size_t tree_end) { - nodes->resize(h_nodes.size()); - dh::safe_cuda(cudaMemcpyAsync(nodes->data().get(), h_nodes.data(), - sizeof(RegTree::Node) * h_nodes.size(), - cudaMemcpyHostToDevice)); - tree_segments->resize(h_tree_segments.size()); - dh::safe_cuda(cudaMemcpyAsync(tree_segments->data().get(), h_tree_segments.data(), - sizeof(size_t) * h_tree_segments.size(), - cudaMemcpyHostToDevice)); - tree_group->resize(model.tree_info.size()); - dh::safe_cuda(cudaMemcpyAsync(tree_group->data().get(), model.tree_info.data(), - sizeof(int) * model.tree_info.size(), - cudaMemcpyHostToDevice)); - this->tree_beg_ = tree_begin; - this->tree_end_ = tree_end; - this->num_group = model.learner_model_param->num_output_group; - } - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { dh::safe_cuda(cudaSetDevice(gpu_id)); - // Allocate device vectors using correct GPU ID context - nodes.reset(new dh::device_vector()); - tree_segments.reset(new dh::device_vector()); - tree_group.reset(new dh::device_vector()); CHECK_EQ(model.param.size_leaf_vector, 0); // Copy decision trees to device - thrust::host_vector h_tree_segments{}; + tree_segments = std::move(HostDeviceVector({}, gpu_id)); + auto& h_tree_segments = tree_segments.HostVector(); h_tree_segments.reserve((tree_end - tree_begin) + 1); size_t sum = 0; h_tree_segments.push_back(sum); @@ -260,13 +236,21 @@ class DeviceModel { h_tree_segments.push_back(sum); } - thrust::host_vector h_nodes(h_tree_segments.back()); + nodes = std::move(HostDeviceVector(h_tree_segments.back(), RegTree::Node(), + gpu_id)); + auto& h_nodes = nodes.HostVector(); for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { auto& src_nodes = model.trees.at(tree_idx)->GetNodes(); std::copy(src_nodes.begin(), src_nodes.end(), h_nodes.begin() + h_tree_segments[tree_idx - tree_begin]); } - CopyModel(model, h_tree_segments, h_nodes, tree_begin, tree_end); + + tree_group = std::move(HostDeviceVector(model.tree_info.size(), 0, gpu_id)); + auto& h_tree_group = tree_group.HostVector(); + std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size()); + this->tree_beg_ = tree_begin; + this->tree_end_ = tree_end; + this->num_group = model.learner_model_param->num_output_group; } }; @@ -293,8 +277,8 @@ class GPUPredictor : public xgboost::Predictor { dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} ( PredictKernel, data, - dh::ToSpan(model_.nodes), predictions->DeviceSpan().subspan(batch_offset), - dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group), + model_.nodes.DeviceSpan(), predictions->DeviceSpan().subspan(batch_offset), + model_.tree_segments.DeviceSpan(), model_.tree_group.DeviceSpan(), model_.tree_beg_, model_.tree_end_, num_features, num_rows, entry_start, use_shared, model_.num_group); } @@ -309,8 +293,8 @@ class GPUPredictor : public 
xgboost::Predictor { dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS} ( PredictKernel, batch, - dh::ToSpan(model_.nodes), out_preds->DeviceSpan().subspan(batch_offset), - dh::ToSpan(model_.tree_segments), dh::ToSpan(model_.tree_group), + model_.nodes.DeviceSpan(), out_preds->DeviceSpan().subspan(batch_offset), + model_.tree_segments.DeviceSpan(), model_.tree_group.DeviceSpan(), model_.tree_beg_, model_.tree_end_, batch.NumFeatures(), num_rows, entry_start, use_shared, model_.num_group); } @@ -441,8 +425,8 @@ class GPUPredictor : public xgboost::Predictor { dh::LaunchKernel {GRID_SIZE, BLOCK_THREADS, shared_memory_bytes} ( PredictKernel, m->Value(), - dh::ToSpan(d_model.nodes), out_preds->predictions.DeviceSpan(), - dh::ToSpan(d_model.tree_segments), dh::ToSpan(d_model.tree_group), + d_model.nodes.DeviceSpan(), out_preds->predictions.DeviceSpan(), + d_model.tree_segments.DeviceSpan(), d_model.tree_group.DeviceSpan(), tree_begin, tree_end, m->NumColumns(), info.num_row_, entry_start, use_shared, output_groups); } From ad216c51bba4b04d3bf039507c7b8398e9a47fae Mon Sep 17 00:00:00 2001 From: Hyunsu Cho Date: Tue, 11 Aug 2020 22:52:30 +0000 Subject: [PATCH 40/41] Simplify pytest setup; use LocalCUDACluster fixture --- tests/pytest.ini | 1 - tests/python-gpu/conftest.py | 37 +++++---- tests/python-gpu/test_gpu_demos.py | 2 - tests/python-gpu/test_gpu_with_dask.py | 107 ++++++++++++------------- 4 files changed, 74 insertions(+), 73 deletions(-) diff --git a/tests/pytest.ini b/tests/pytest.ini index f34505d3532c..5a0d27a6cec6 100644 --- a/tests/pytest.ini +++ b/tests/pytest.ini @@ -3,4 +3,3 @@ markers = mgpu: Mark a test that requires multiple GPUs to run. ci: Mark a test that runs only on CI. gtest: Mark a test that requires C++ Google Test executable. - no_rmm_pool_setup: Mark a test to skip the setup_rmm_pool() fixture. 
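The conftest.py rewrite shown next replaces the module-scoped RMM kwargs fixture with a function-scoped `local_cuda_cluster` fixture that individual tests configure through pytest's indirect parametrization. For readers unfamiliar with that mechanism, here is a minimal, self-contained sketch; the dict-based `local_cluster` stand-in is illustrative only (it takes the place of a real `LocalCUDACluster`) and is not part of this patch series:

```python
import pytest

@pytest.fixture(scope='function')
def local_cluster(request):
    # Indirectly parametrized tests attach their kwargs to request.param;
    # tests that merely request the fixture get the defaults.
    kwargs = dict(getattr(request, 'param', {}))
    cluster = {'kwargs': kwargs, 'closed': False}  # stand-in for LocalCUDACluster(**kwargs)
    yield cluster                                  # hand the live cluster to the test body
    cluster['closed'] = True                       # teardown runs even if the test fails

@pytest.mark.parametrize('local_cluster', [{'n_workers': 2}], indirect=['local_cluster'])
def test_two_workers(local_cluster):
    assert local_cluster['kwargs'] == {'n_workers': 2}

def test_defaults(local_cluster):
    assert local_cluster['kwargs'] == {}
```

A function-scoped, yielding fixture guarantees each test gets a fresh cluster and that cleanup runs after every test, which is the shape the conftest.py diff below adopts.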
diff --git a/tests/python-gpu/conftest.py b/tests/python-gpu/conftest.py index 05ddf10c5036..1865ce529a98 100644 --- a/tests/python-gpu/conftest.py +++ b/tests/python-gpu/conftest.py @@ -1,6 +1,10 @@ +import sys import pytest import logging +sys.path.append("tests/python") +import testing as tm # noqa + def has_rmm(): try: import rmm @@ -8,33 +12,34 @@ def has_rmm(): except ImportError: return False -def get_module_attributes(module): - if not hasattr(module, 'pytestmark'): - return [] - if isinstance(module.pytestmark, list): - return [x.name for x in module.pytestmark] - return [module.pytestmark.name] - -@pytest.fixture(scope='module', autouse=True) +@pytest.fixture(scope='session', autouse=True) def setup_rmm_pool(request, pytestconfig): - if (pytestconfig.getoption('--use-rmm-pool') and - 'no_rmm_pool_setup' not in get_module_attributes(request.module)): + if pytestconfig.getoption('--use-rmm-pool'): if not has_rmm(): raise ImportError('The --use-rmm-pool option requires the RMM package') import rmm from dask_cuda.utils import get_n_gpus - rmm.reinitialize(pool_allocator=True, devices=list(range(get_n_gpus()))) + rmm.reinitialize(pool_allocator=True, initial_pool_size=1024*1024*1024, + devices=list(range(get_n_gpus()))) -@pytest.fixture(scope='module', autouse=True) -def local_cuda_cluster_rmm_kwargs(request, pytestconfig): - if pytestconfig.getoption('--use-rmm-pool') and request.module.__name__ == 'test_gpu_with_dask': +@pytest.fixture(scope='function') +def local_cuda_cluster(request, pytestconfig): + kwargs = {} + if hasattr(request, 'param'): + kwargs.update(request.param) + if pytestconfig.getoption('--use-rmm-pool'): if not has_rmm(): raise ImportError('The --use-rmm-pool option requires the RMM package') import rmm from dask_cuda.utils import get_n_gpus rmm.reinitialize() - return {'rmm_pool_size': '8GB'} - return {} + kwargs['rmm_pool_size'] = '2GB' + if tm.no_dask_cuda()['condition']: + raise ImportError('The local_cuda_cluster fixture requires dask_cuda package') + from dask_cuda import LocalCUDACluster + cluster = LocalCUDACluster(**kwargs) + yield cluster + cluster.close() def pytest_addoption(parser): parser.addoption('--use-rmm-pool', action='store_true', default=False, help='Use RMM pool') diff --git a/tests/python-gpu/test_gpu_demos.py b/tests/python-gpu/test_gpu_demos.py index 58dc262e6b4e..f74d2adc2823 100644 --- a/tests/python-gpu/test_gpu_demos.py +++ b/tests/python-gpu/test_gpu_demos.py @@ -6,8 +6,6 @@ import testing as tm import test_demos as td # noqa -pytestmark = pytest.mark.no_rmm_pool_setup - @pytest.mark.skipif(**tm.no_cupy()) def test_data_iterator(): script = os.path.join(td.PYTHON_DEMO_DIR, 'data_iterator.py') diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index dc9ba0a49d4d..a06bfc28361f 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -12,8 +12,6 @@ if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows", allow_module_level=True) -pytestmark = pytest.mark.no_rmm_pool_setup - sys.path.append("tests/python") from test_with_dask import run_empty_dmatrix_reg # noqa from test_with_dask import run_empty_dmatrix_cls # noqa @@ -24,7 +22,6 @@ try: import dask.dataframe as dd from xgboost import dask as dxgb - from dask_cuda import LocalCUDACluster from dask.distributed import Client from dask import array as da import cudf @@ -158,44 +155,45 @@ class TestDistributedGPU: @pytest.mark.skipif(**tm.no_dask_cudf()) 
@pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_dask_dataframe(self, local_cuda_cluster_rmm_kwargs): - with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - run_with_dask_dataframe(dxgb.DaskDMatrix, client) - run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client) + def test_dask_dataframe(self, local_cuda_cluster): + with Client(local_cuda_cluster) as client: + run_with_dask_dataframe(dxgb.DaskDMatrix, client) + run_with_dask_dataframe(dxgb.DaskDeviceQuantileDMatrix, client) @given(params=parameter_strategy, num_rounds=strategies.integers(1, 20), dataset=tm.dataset_strategy) @settings(deadline=duration(seconds=120)) + @pytest.mark.skipif(**tm.no_dask()) + @pytest.mark.skipif(**tm.no_dask_cuda()) + @pytest.mark.parametrize('local_cuda_cluster', [{'n_workers': 2}], indirect=['local_cuda_cluster']) @pytest.mark.mgpu - def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster_rmm_kwargs): - with LocalCUDACluster(n_workers=2, **local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, - client) - run_gpu_hist(params, num_rounds, dataset, - dxgb.DaskDeviceQuantileDMatrix, client) + def test_gpu_hist(self, params, num_rounds, dataset, local_cuda_cluster): + with Client(local_cuda_cluster) as client: + run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, + client) + run_gpu_hist(params, num_rounds, dataset, + dxgb.DaskDeviceQuantileDMatrix, client) @pytest.mark.skipif(**tm.no_cupy()) + @pytest.mark.skipif(**tm.no_dask()) + @pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_dask_array(self, local_cuda_cluster_rmm_kwargs): - with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - run_with_dask_array(dxgb.DaskDMatrix, client) - run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client) + def test_dask_array(self, local_cuda_cluster): + with Client(local_cuda_cluster) as client: + run_with_dask_array(dxgb.DaskDMatrix, client) + run_with_dask_array(dxgb.DaskDeviceQuantileDMatrix, client) @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.skipif(**tm.no_dask_cuda()) @pytest.mark.mgpu - def test_empty_dmatrix(self, local_cuda_cluster_rmm_kwargs): - with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - parameters = {'tree_method': 'gpu_hist', - 'debug_synchronize': True} - run_empty_dmatrix_reg(client, parameters) - run_empty_dmatrix_cls(client, parameters) - - def run_quantile(self, name, local_cuda_cluster_rmm_kwargs): + def test_empty_dmatrix(self, local_cuda_cluster): + with Client(local_cuda_cluster) as client: + parameters = {'tree_method': 'gpu_hist', + 'debug_synchronize': True} + run_empty_dmatrix_reg(client, parameters) + run_empty_dmatrix_cls(client, parameters) + + def run_quantile(self, name, local_cuda_cluster): if sys.platform.startswith("win"): pytest.skip("Skipping dask tests on Windows") @@ -218,32 +216,33 @@ def runit(worker_addr, rabit_args): env[port[0]] = port[1] return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE) - with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster: - with Client(cluster) as client: - workers = list(dxgb._get_client_workers(client).keys()) - rabit_args = client.sync(dxgb._get_rabit_args, workers, client) - futures = client.map(runit, - workers, - pure=False, - workers=workers, - rabit_args=rabit_args) - results = client.gather(futures) - for ret 
-                for ret in results:
-                    msg = ret.stdout.decode('utf-8')
-                    assert msg.find('1 test from GPUQuantile') != -1, msg
-                    assert ret.returncode == 0, msg
+        with Client(local_cuda_cluster) as client:
+            workers = list(dxgb._get_client_workers(client).keys())
+            rabit_args = client.sync(dxgb._get_rabit_args, workers, client)
+            futures = client.map(runit,
+                                 workers,
+                                 pure=False,
+                                 workers=workers,
+                                 rabit_args=rabit_args)
+            results = client.gather(futures)
+            for ret in results:
+                msg = ret.stdout.decode('utf-8')
+                assert msg.find('1 test from GPUQuantile') != -1, msg
+                assert ret.returncode == 0, msg

     @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.skipif(**tm.no_dask_cuda())
     @pytest.mark.mgpu
     @pytest.mark.gtest
-    def test_quantile_basic(self, local_cuda_cluster_rmm_kwargs):
-        self.run_quantile('AllReduceBasic', local_cuda_cluster_rmm_kwargs)
+    def test_quantile_basic(self, local_cuda_cluster):
+        self.run_quantile('AllReduceBasic', local_cuda_cluster)

     @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.skipif(**tm.no_dask_cuda())
     @pytest.mark.mgpu
     @pytest.mark.gtest
-    def test_quantile_same_on_all_workers(self, local_cuda_cluster_rmm_kwargs):
-        self.run_quantile('SameOnAllWorkers', local_cuda_cluster_rmm_kwargs)
+    def test_quantile_same_on_all_workers(self, local_cuda_cluster):
+        self.run_quantile('SameOnAllWorkers', local_cuda_cluster)


 async def run_from_dask_array_asyncio(scheduler_address):
@@ -274,11 +273,11 @@ async def run_from_dask_array_asyncio(scheduler_address):

 @pytest.mark.skipif(**tm.no_dask())
+@pytest.mark.skipif(**tm.no_dask_cuda())
 @pytest.mark.mgpu
-def test_with_asyncio(local_cuda_cluster_rmm_kwargs):
-    with LocalCUDACluster(**local_cuda_cluster_rmm_kwargs) as cluster:
-        with Client(cluster) as client:
-            address = client.scheduler.address
-            output = asyncio.run(run_from_dask_array_asyncio(address))
-            assert isinstance(output['booster'], xgboost.Booster)
-            assert isinstance(output['history'], dict)
+def test_with_asyncio(local_cuda_cluster):
+    with Client(local_cuda_cluster) as client:
+        address = client.scheduler.address
+        output = asyncio.run(run_from_dask_array_asyncio(address))
+        assert isinstance(output['booster'], xgboost.Booster)
+        assert isinstance(output['history'], dict)

From b4195cd5ea4f91b858527117a34c805e55a1f716 Mon Sep 17 00:00:00 2001
From: Hyunsu Cho
Date: Tue, 11 Aug 2020 22:55:49 +0000
Subject: [PATCH 41/41] Address reviewers' comments

---
 tests/cpp/helpers.cc | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc
index 0d9d3770498b..858b651981fb 100644
--- a/tests/cpp/helpers.cc
+++ b/tests/cpp/helpers.cc
@@ -489,20 +489,20 @@ std::unique_ptr<GradientBooster> CreateTrainedGBM(

 #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-using cuda_mr_t = rmm::mr::cuda_memory_resource;
-using pool_mr_t = rmm::mr::pool_memory_resource<cuda_mr_t>;
+using CUDAMemoryResource = rmm::mr::cuda_memory_resource;
+using PoolMemoryResource = rmm::mr::pool_memory_resource<CUDAMemoryResource>;
 class RMMAllocator {
  public:
-  std::vector<std::unique_ptr<cuda_mr_t>> cuda_mr;
-  std::vector<std::unique_ptr<pool_mr_t>> pool_mr;
+  std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
+  std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
   int n_gpu;
   RMMAllocator() : n_gpu(common::AllVisibleGPUs()) {
     int current_device;
     CHECK_EQ(cudaGetDevice(&current_device), cudaSuccess);
     for (int i = 0; i < n_gpu; ++i) {
       CHECK_EQ(cudaSetDevice(i), cudaSuccess);
-      cuda_mr.push_back(std::unique_ptr<cuda_mr_t>(new cuda_mr_t));
-      pool_mr.push_back(std::unique_ptr<pool_mr_t>(new pool_mr_t(cuda_mr[i].get())));
+      cuda_mr.push_back(std::make_unique<CUDAMemoryResource>());
+      pool_mr.push_back(std::make_unique<PoolMemoryResource>(cuda_mr[i].get()));
     }
     CHECK_EQ(cudaSetDevice(current_device), cudaSuccess);
   }
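For reference, the following standalone sketch (not part of any patch in this series) shows the kind of per-GPU pool construction that the RMMAllocator helper above exercises. The type aliases mirror the renamed ones in tests/cpp/helpers.cc; the final set_default_resource call is an assumption based on the default_memory_resource.hpp header this series includes in device_helpers.cuh, and later RMM releases rename it to set_current_device_resource.

// Illustrative sketch only, assuming the RMM headers pulled in by this series.
#include <memory>
#include <vector>

#include <cuda_runtime_api.h>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/default_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

using CUDAMemoryResource = rmm::mr::cuda_memory_resource;
using PoolMemoryResource = rmm::mr::pool_memory_resource<CUDAMemoryResource>;

int main() {
  int n_gpu = 0;
  if (cudaGetDeviceCount(&n_gpu) != cudaSuccess || n_gpu == 0) {
    return 1;  // no usable GPU
  }

  // One plain CUDA resource plus one pooling resource per visible device,
  // mirroring the cuda_mr/pool_mr vectors kept alive by RMMAllocator.
  std::vector<std::unique_ptr<CUDAMemoryResource>> cuda_mr;
  std::vector<std::unique_ptr<PoolMemoryResource>> pool_mr;
  for (int i = 0; i < n_gpu; ++i) {
    cudaSetDevice(i);
    cuda_mr.push_back(std::make_unique<CUDAMemoryResource>());
    // Each pool sub-allocates from the plain CUDA resource on its device,
    // so repeated cudaMalloc/cudaFree round-trips are avoided.
    pool_mr.push_back(std::make_unique<PoolMemoryResource>(cuda_mr[i].get()));
  }

  // Route subsequent RMM allocations through the pool on device 0
  // (API name assumed from the RMM version targeted by this series).
  cudaSetDevice(0);
  rmm::mr::set_default_resource(pool_mr[0].get());
  return 0;
}

The resources must outlive every allocation served from them, which is why the test harness keeps them in long-lived vectors rather than constructing them on the fly.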