[BLOCKING] [jvm-packages] add gpu_hist and enable gpu scheduling #5171

Merged · 12 commits · Jul 27, 2020
42 changes: 42 additions & 0 deletions Jenkinsfile
@@ -75,6 +75,7 @@ pipeline {
'build-gpu-cuda10.1': { BuildCUDA(cuda_version: '10.1') },
'build-gpu-cuda10.2': { BuildCUDA(cuda_version: '10.2') },
'build-gpu-cuda11.0': { BuildCUDA(cuda_version: '11.0') },
'build-jvm-packages-gpu-cuda10.0': { BuildJVMPackagesWithCUDA(spark_version: '3.0.0', cuda_version: '10.0') },
'build-jvm-packages': { BuildJVMPackages(spark_version: '3.0.0') },
'build-jvm-doc': { BuildJVMDoc() }
])
@@ -94,6 +95,7 @@ pipeline {
'test-python-mgpu-cuda10.2': { TestPythonGPU(host_cuda_version: '10.2', multi_gpu: true) },
'test-cpp-gpu-cuda10.2': { TestCppGPU(artifact_cuda_version: '10.2', host_cuda_version: '10.2') },
'test-cpp-gpu-cuda11.0': { TestCppGPU(artifact_cuda_version: '11.0', host_cuda_version: '11.0') },
'test-jvm-jdk8-cuda10.0': { CrossTestJVMwithJDKGPU(artifact_cuda_version: '10.0', host_cuda_version: '10.0') },
'test-jvm-jdk8': { CrossTestJVMwithJDK(jdk_version: '8', spark_version: '3.0.0') },
'test-jvm-jdk11': { CrossTestJVMwithJDK(jdk_version: '11') },
'test-jvm-jdk12': { CrossTestJVMwithJDK(jdk_version: '12') },
@@ -282,6 +284,28 @@ def BuildCUDA(args) {
}
}

def BuildJVMPackagesWithCUDA(args) {
node('linux && gpu') {
unstash name: 'srcs'
echo "Build XGBoost4J-Spark with Spark ${args.spark_version}, CUDA ${args.cuda_version}"
def container_type = "jvm_gpu_build"
def docker_binary = "nvidia-docker"
def docker_args = "--build-arg CUDA_VERSION=${args.cuda_version}"
def arch_flag = ""
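// On PR branches (not master or release), build for a single GPU compute capability (7.5) to keep CI builds fast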
if (env.BRANCH_NAME != 'master' && !(env.BRANCH_NAME.startsWith('release'))) {
arch_flag = "-DGPU_COMPUTE_VER=75"
}
// Use only 4 CPU cores
def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='--cpuset-cpus 0-3'"
sh """
${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/build_jvm_packages.sh ${args.spark_version} -Duse.cuda=ON $arch_flag
"""
echo "Stashing XGBoost4J JAR with CUDA ${args.cuda_version} ..."
stash name: 'xgboost4j_jar_gpu', includes: "jvm-packages/xgboost4j/target/*.jar,jvm-packages/xgboost4j-spark/target/*.jar,jvm-packages/xgboost4j-example/target/*.jar"
deleteDir()
}
}

def BuildJVMPackages(args) {
node('linux && cpu') {
unstash name: 'srcs'
@@ -386,6 +410,24 @@ def TestCppGPU(args) {
}
}

def CrossTestJVMwithJDKGPU(args) {
def nodeReq = 'linux && mgpu'
node(nodeReq) {
unstash name: "xgboost4j_jar_gpu"
unstash name: 'srcs'
if (args.spark_version != null) {
echo "Test XGBoost4J on a machine with JDK ${args.jdk_version}, Spark ${args.spark_version}, CUDA ${args.host_cuda_version}"
} else {
echo "Test XGBoost4J on a machine with JDK ${args.jdk_version}, CUDA ${args.host_cuda_version}"
}
def container_type = "gpu_jvm"
def docker_binary = "nvidia-docker"
def docker_args = "--build-arg CUDA_VERSION=${args.host_cuda_version}"
sh "${dockerRun} ${container_type} ${docker_binary} ${docker_args} tests/ci_build/test_jvm_gpu_cross.sh"
deleteDir()
}
}

def CrossTestJVMwithJDK(args) {
node('linux && cpu') {
unstash name: 'xgboost4j_jar'
8 changes: 8 additions & 0 deletions doc/jvm/index.rst
@@ -202,6 +202,14 @@ If you are on Mac OS and using a compiler that supports OpenMP, you need to go t

in order to get the benefit of multi-threading.

Building with GPU support
-------------------------
To build XGBoost4J with support for distributed GPU training, run

.. code-block:: bash

mvn -Duse.cuda=ON install
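
For illustration, here is a minimal Scala sketch of how the resulting GPU-enabled XGBoost4J-Spark build might be used on Spark 3.0 with GPU scheduling. The Spark resource keys are standard Spark 3.0 settings, while the application name, discovery-script path, and column names are placeholder assumptions.

.. code-block:: scala

   import org.apache.spark.sql.SparkSession
   import ml.dmlc.xgboost4j.scala.spark.XGBoostClassifier

   object GpuTrainingSketch {
     def main(args: Array[String]): Unit = {
       // Ask Spark for one GPU per executor and per task so that each
       // XGBoost worker gets a dedicated GPU (the script path is a placeholder).
       val spark = SparkSession.builder()
         .appName("xgboost4j-gpu-sketch")
         .config("spark.executor.resource.gpu.amount", "1")
         .config("spark.task.resource.gpu.amount", "1")
         .config("spark.executor.resource.gpu.discoveryScript",
           "/opt/spark/getGpusResources.sh")
         .getOrCreate()

       val xgbParam = Map(
         "objective"   -> "multi:softprob",
         "num_class"   -> 3,
         "num_round"   -> 100,
         "num_workers" -> 1,          // one distributed worker per GPU
         "tree_method" -> "gpu_hist"  // the GPU histogram algorithm
       )
       val classifier = new XGBoostClassifier(xgbParam)
         .setFeaturesCol("features")
         .setLabelCol("classIndex")
       // classifier.fit(trainingDF) starts GPU training once a DataFrame with
       // a "features" vector column and a "classIndex" label column is prepared.
     }
   }

Setting num_workers to the number of GPUs requested keeps one training task per GPU, which matches the updated examples in this PR.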

********
Contents
********
16 changes: 15 additions & 1 deletion jvm-packages/create_jni.py
@@ -1,13 +1,13 @@
#!/usr/bin/env python
import errno
import argparse
import glob
import os
import shutil
import subprocess
import sys
from contextlib import contextmanager


# Monkey-patch the API inconsistency between Python2.X and 3.X.
if sys.platform.startswith("linux"):
sys.platform = "linux"
@@ -20,6 +20,7 @@
"USE_S3": "OFF",

"USE_CUDA": "OFF",
"USE_NCCL": "OFF",
"JVM_BINDINGS": "ON"
}

@@ -68,6 +69,10 @@ def normpath(path):


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--use-cuda', type=str, choices=['ON', 'OFF'], default='OFF')
cli_args = parser.parse_args()

if sys.platform == "darwin":
# Enable if your compiler supports OpenMP.
CONFIG["USE_OPENMP"] = "OFF"
@@ -88,12 +93,21 @@ def normpath(path):
else:
maybe_parallel_build = ""

if cli_args.use_cuda == 'ON':
CONFIG['USE_CUDA'] = 'ON'
CONFIG['USE_NCCL'] = 'ON'

args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]

# if the RABIT_MOCK environment variable is set
if os.getenv("RABIT_MOCK", None) is not None:
args.append("-DRABIT_MOCK:BOOL=ON")

# if the GPU_ARCH_FLAG environment variable is set
gpu_arch_flag = os.getenv("GPU_ARCH_FLAG", None)
if gpu_arch_flag is not None:
args.append("%s" % gpu_arch_flag)

run("cmake .. " + " ".join(args) + maybe_generator)
run("cmake --build . --config Release" + maybe_parallel_build)

87 changes: 78 additions & 9 deletions jvm-packages/pom.xml
@@ -38,6 +38,7 @@
<scala.version>2.12.8</scala.version>
<scala.binary.version>2.12</scala.binary.version>
<hadoop.version>2.7.3</hadoop.version>
<use.cuda>OFF</use.cuda>
</properties>
<repositories>
<repository>
@@ -52,7 +53,65 @@
<module>xgboost4j-spark</module>
<module>xgboost4j-flink</module>
</modules>

<profiles>
<profile>
<!-- default active profile excluding gpu related test suites -->
<id>default</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<configuration>
<tagsToExclude>ml.dmlc.xgboost4j.java.GpuTestSuite</tagsToExclude>
</configuration>
</plugin>
</plugins>
</build>
</profile>

<!-- gpu profile with both cpu and gpu test suites -->
<profile>
<id>gpu</id>
<activation>
<property>
<name>use.cuda</name>
<value>ON</value>
</property>
</activation>
<build>
<plugins>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</profile>

<!-- gpu-with-gpu-tests profile with only gpu test suites -->
<profile>
<id>gpu-with-gpu-tests</id>
CodingCat (Member) commented on Jul 24, 2020:

Do we really need two profiles for GPU? Maven does support running a particular test suite.

Contributor Author replied:

Hi @CodingCat, thanks for the review.

I take "Maven does support running a particular suite" to mean something like

 mvn test -Dsuites=ml.dmlc.xgboost4j.scala.DMatrixSuite

Maven can indeed run a specific suite this way, but the suite has to be named by its fully qualified class name. So whenever a PR adds another GPU-related suite, the developer would also have to update the CI build script to list it.

The gpu-with-gpu-tests profile is used only by CI, and it picks up any suite tagged with GpuTestSuite. A developer only has to add the GpuTestSuite tag to a new GPU suite (see the sketch after this pom.xml diff); the CI scripts do not need to change.

<properties>
<use.cuda>ON</use.cuda>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<configuration>
<tagsToInclude>ml.dmlc.xgboost4j.java.GpuTestSuite</tagsToInclude>
</configuration>
</plugin>
</plugins>
</build>
</profile>

<profile>
<id>release</id>
<build>
@@ -242,6 +301,25 @@
<filtering>true</filtering>
</resource>
</resources>

<pluginManagement>
<plugins>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>

<plugins>
<plugin>
<groupId>org.scalastyle</groupId>
@@ -336,15 +414,6 @@
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
<extensions>
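To illustrate the tagging approach discussed in the review thread above, here is a hypothetical sketch of a GPU-only suite. The annotation's fully qualified name matches the tagsToInclude/tagsToExclude value used in the profiles; the suite name, the test body, and the class-level placement of the tag are assumptions made for illustration.

```scala
import org.scalatest.FunSuite
import ml.dmlc.xgboost4j.java.GpuTestSuite

// Hypothetical suite: the GpuTestSuite tag is what the gpu-with-gpu-tests
// profile includes and the default profile excludes.
@GpuTestSuite
class GpuHistSketchSuite extends FunSuite {
  test("gpu_hist tree method is accepted") {
    // A real suite would train a small model with tree_method = "gpu_hist"
    // and assert on the result; that needs a GPU, so it is omitted here.
    assert(true)
  }
}
```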
SparkMLlibPipeline.scala
@@ -31,15 +31,20 @@ object SparkMLlibPipeline {

def main(args: Array[String]): Unit = {

if (args.length != 3) {
println("Usage: SparkMLlibPipeline input_path native_model_path pipeline_model_path")
if (args.length != 3 && args.length != 4) {
println("Usage: SparkMLlibPipeline input_path native_model_path pipeline_model_path " +
"[cpu|gpu]")
sys.exit(1)
}

val inputPath = args(0)
val nativeModelPath = args(1)
val pipelineModelPath = args(2)

val (treeMethod, numWorkers) = if (args.length == 4 && args(3) == "gpu") {
("gpu_hist", 1)
} else ("auto", 2)

val spark = SparkSession
.builder()
.appName("XGBoost4J-Spark Pipeline Example")
@@ -76,7 +81,8 @@
"objective" -> "multi:softprob",
"num_class" -> 3,
"num_round" -> 100,
"num_workers" -> 2
"num_workers" -> numWorkers,
"tree_method" -> treeMethod
)
)
booster.setFeaturesCol("features")
SparkTraining.scala
@@ -28,9 +28,14 @@ object SparkTraining {
def main(args: Array[String]): Unit = {
if (args.length < 1) {
// scalastyle:off
println("Usage: program input_path")
println("Usage: program input_path [cpu|gpu]")
sys.exit(1)
}

val (treeMethod, numWorkers) = if (args.length == 2 && args(1) == "gpu") {
("gpu_hist", 1)
} else ("auto", 2)

val spark = SparkSession.builder().getOrCreate()
val inputPath = args(0)
val schema = new StructType(Array(
@@ -68,7 +73,8 @@
"objective" -> "multi:softprob",
"num_class" -> 3,
"num_round" -> 100,
"num_workers" -> 2,
"num_workers" -> numWorkers,
"tree_method" -> treeMethod,
"eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2))
val xgbClassifier = new XGBoostClassifier(xgbParam).
setFeaturesCol("features").