[RFC] Version 0.90 release candidate (#4475)

* Release 0.90 * Add script to automatically generate acknowledgment * Update NEWS.md
dmlc · May 20, 2019 · 515f5f5 · 515f5f5
1 parent adcd8ea
commit 515f5f5
Show file tree

Hide file tree

Showing 16 changed files with 244 additions and 15 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.3)
-project(xgboost LANGUAGES CXX C VERSION 0.82)
+project(xgboost LANGUAGES CXX C VERSION 0.90)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)

diff --git a/Jenkinsfile b/Jenkinsfile
@@ -155,8 +155,11 @@ def Doxygen() {
     def container_type = "cpu"
     def docker_binary = "docker"
     sh """
-    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/doxygen.sh
+    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/doxygen.sh ${BRANCH_NAME}
     """
+    archiveArtifacts artifacts: "build/${BRANCH_NAME}.tar.bz2", allowEmptyArchive: true
+    echo 'Uploading doc...'
+    s3Upload file: "build/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "doxygen/${BRANCH_NAME}.tar.bz2"
     deleteDir()
   }
 }

diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,142 @@ XGBoost Change Log
 
 This file records the changes in xgboost library in reverse chronological order.
 
+## v0.90 (2019.05.18)
+
+### XGBoost Python package drops Python 2.x (#4379, #4381)
+Python 2.x is reaching its end-of-life at the end of this year. [Many scientific Python packages are now moving to drop Python 2.x](https://python3statement.org/).
+
+### XGBoost4J-Spark now requires Spark 2.4.x (#4377)
+* Spark 2.3 is reaching its end-of-life soon. See discussion at #4389.
+* **Consistent handling of missing values** (#4309, #4349, #4411): Many users had reported issues with inconsistent predictions between XGBoost4J-Spark and the Python XGBoost package. The issue was caused by Spark mis-handling non-zero missing values (NaN, -1, 999 etc). We now alert the user whenever Spark doesn't handle missing values correctly (#4309, #4349). See [the tutorial for dealing with missing values in XGBoost4J-Spark](https://xgboost.readthedocs.io/en/release_0.90/jvm/xgboost4j_spark_tutorial.html#dealing-with-missing-values). This fix also depends on the availability of Spark 2.4.x.
+
+### Roadmap: better performance scaling for multi-core CPUs (#4310)
+* Poor performance scaling of the `hist` algorithm for multi-core CPUs has been under investigation (#3810). #4310 optimizes quantile sketches and other pre-processing tasks. Special thanks to @SmirnovEgorRu.
+
+### Roadmap: Harden distributed training (#4250)
+* Make distributed training in XGBoost more robust by hardening [Rabit](https://github.com/dmlc/rabit), which implements [the AllReduce primitive](https://en.wikipedia.org/wiki/Reduce_%28parallel_pattern%29). In particular, improve test coverage on mechanisms for fault tolerance and recovery. Special thanks to @chenqin.
+
+### New feature: Multi-class metric functions for GPUs (#4368)
+* Metrics for multi-class classification have been ported to GPU: `merror`, `mlogloss`. Special thanks to @trivialfis.
+* With supported metrics, XGBoost will select the correct devices based on your system and `n_gpus` parameter.
+
+### New feature: Scikit-learn-like random forest API (#4148, #4255, #4258)
+* XGBoost Python package now offers `XGBRFClassifier` and `XGBRFRegressor` API to train random forests. See [the tutorial](https://xgboost.readthedocs.io/en/release_0.90/tutorials/rf.html). Special thanks to @canonizer
+
+### New feature: use external memory in GPU predictor (#4284, #4396, #4438, #4457)
+* It is now possible to make predictions on GPU when the input is read from external memory. This is useful when you want to make predictions with a big dataset that does not fit into the GPU memory. Special thanks to @rongou, @canonizer, @sriramch.
+
+  ```python
+  dtest = xgboost.DMatrix('test_data.libsvm#dtest.cache')
+  bst.set_param('predictor', 'gpu_predictor')
+  bst.predict(dtest)
+  ```
+
+* Coming soon: GPU training (`gpu_hist`) with external memory
+
+### New feature: XGBoost can now handle comments in LIBSVM files (#4430)
+* Special thanks to @trivialfis and @hcho3
+
+### New feature: Embed XGBoost in your C/C++ applications using CMake (#4323, #4333, #4453)
+* It is now easier than ever to embed XGBoost in your C/C++ applications. In your CMakeLists.txt, add `xgboost::xgboost` as a linked library:
+
+  ```cmake
+  find_package(xgboost REQUIRED)
+  add_executable(api-demo c-api-demo.c)
+  target_link_libraries(api-demo xgboost::xgboost)
+  ```
+
+  [XGBoost C API documentation is available.](https://xgboost.readthedocs.io/en/release_0.90/dev) Special thanks to @trivialfis
+
+### Performance improvements
+* Use feature interaction constraints to narrow split search space (#4341, #4428)
+* Additional optimizations for `gpu_hist` (#4248, #4283)
+* Reduce OpenMP thread launches in `gpu_hist` (#4343)
+* Additional optimizations for multi-node multi-GPU random forests. (#4238)
+* Allocate unique prediction buffer for each input matrix, to avoid re-sizing GPU array (#4275)
+* Remove various synchronisations from CUDA API calls (#4205)
+* XGBoost4J-Spark
+  - Allow the user to control whether to cache partitioned training data, to potentially reduce execution time (#4268)
+
+### Bug-fixes
+* Fix node reuse in `hist` (#4404)
+* Fix GPU histogram allocation (#4347)
+* Fix matrix attributes not sliced (#4311)
+* Revise AUC and AUCPR metrics so that they work with weighted ranking tasks (#4216, #4436)
+* Fix timer invocation for InitDataOnce() in `gpu_hist` (#4206)
+* Fix R-devel errors (#4251)
+* Make gradient update in GPU linear updater thread-safe (#4259)
+* Prevent out-of-range access in column matrix (#4231)
+* Don't store DMatrix handle in Python object until it's initialized, to improve exception safety (#4317)
+* XGBoost4J-Spark
+  - Fix non-deterministic order within a zipped partition on prediction (#4388)
+  - Remove race condition on tracker shutdown (#4224)
+  - Allow setting the parameter `maxLeaves`. (#4226)
+  - Allow partial evaluation of dataframe before prediction (#4407)
+  - Automatically set `maximize_evaluation_metrics` if not explicitly given (#4446)
+
+### API changes
+* Deprecate `reg:linear` in favor of `reg:squarederror`. (#4267, #4427)
+* Add attribute getter and setter to the Booster object in XGBoost4J (#4336)
+
+### Maintenance: Refactor C++ code for legibility and maintainability
+* Fix clang-tidy warnings. (#4149)
+* Remove deprecated C APIs. (#4266)
+* Use Monitor class to time functions in `hist`. (#4273)
+* Retire DVec class in favour of c++20 style span for device memory. (#4293)
+* Improve HostDeviceVector exception safety (#4301)
+
+### Maintenance: testing, continuous integration, build system
+* **Major refactor of CMakeLists.txt** (#4323, #4333, #4453): adopt modern CMake and export XGBoost as a target
+* **Major improvement in Jenkins CI pipeline** (#4234)
+  - Migrate all Linux tests to Jenkins (#4401)
+  - Builds and tests are now de-coupled, to test an artifact against multiple versions of CUDA, JDK, and other dependencies (#4401)
+  - Add Windows GPU to Jenkins CI pipeline (#4463, #4469)
+* Support CUDA 10.1 (#4223, #4232, #4265, #4468)
+* Python wheels are now built with CUDA 9.0, so that JIT is not required on Volta architecture (#4459)
+* Integrate with NVTX CUDA profiler (#4205)
+* Add a test for cpu predictor using external memory (#4308)
+* Refactor tests to get rid of duplication (#4358)
+* Remove test dependency on `craigcitro/r-travis`, since it's deprecated (#4353)
+* Add files from local R build to `.gitignore` (#4346)
+* Make XGBoost4J compatible with Java 9+ by revising NativeLibLoader (#4351)
+* Jenkins build for CUDA 10.0 (#4281)
+* Remove remaining `silent` and `debug_verbose` in Python tests (#4299)
+* Use all cores to build XGBoost4J lib on linux (#4304)
+* Upgrade Jenkins Linux build environment to GCC 5.3.1, CMake 3.6.0 (#4306)
+* Make CMakeLists.txt compatible with CMake 3.3 (#4420)
+* Add OpenMP option in CMakeLists.txt (#4339)
+* Get rid of a few trivial compiler warnings (#4312)
+* Add external Docker build cache, to speed up builds on Jenkins CI (#4331, #4334, #4458)
+* Fix Windows tests (#4403)
+* Fix a broken python test (#4395)
+* Use a fixed seed to split data in XGBoost4J-Spark tests, for reproducibility (#4417)
+* Add additional Python tests to test training under constraints (#4426)
+* Enable building with shared NCCL. (#4447)
+
+### Usability Improvements, Documentation
+* Document limitation of one-split-at-a-time Greedy tree learning heuristic (#4233)
+* Update build doc: PyPI wheel now supports multi-GPU (#4219)
+* Fix docs for `num_parallel_tree` (#4221)
+* Fix document about `colsample_by*` parameter (#4340)
+* Make the train and test inputs have the same colnames. (#4329)
+* Update R contribute link. (#4236)
+* Fix travis R tests (#4277)
+* Log version number in crash log in XGBoost4J-Spark (#4271, #4303)
+* Allow suppression of Rabit output in Booster::train in XGBoost4J (#4262)
+* Add tutorial on handling missing values in XGBoost4J-Spark (#4425)
+* Fix typos (#4345, #4393, #4432, #4435)
+* Added language classifier in setup.py (#4327)
+* Added Travis CI badge (#4344)
+* Add BentoML to use case section (#4400)
+* Remove subtly sexist remark (#4418)
+* Add R vignette about parsing JSON dumps (#4439)
+
+### Acknowledgement
+**Contributors**: Nan Zhu (@CodingCat), Adam Pocock (@Craigacp), Daniel Hen (@Daniel8hen), Jiaxiang Li (@JiaxiangBU), Rory Mitchell (@RAMitchell), Egor Smirnov (@SmirnovEgorRu), Andy Adinets (@canonizer), Jonas (@elcombato), Harry Braviner (@harrybraviner), Philip Hyunsu Cho (@hcho3), Tong He (@hetong007), James Lamb (@jameslamb), Jean-Francois Zinque (@jeffzi), Yang Yang (@jokerkeny), Mayank Suman (@mayanksuman), jess (@monkeywithacupcake), Hajime Morrita (@omo), Ravi Kalia (@project-delphi), @ras44, Rong Ou (@rongou), Shaochen Shi (@shishaochen), Xu Xiao (@sperlingxx), @sriramch, Jiaming Yuan (@trivialfis), Christopher Suchanek (@wsuchy), Bozhao (@yubozhao)
+
+**Reviewers**: Nan Zhu (@CodingCat), Adam Pocock (@Craigacp), Daniel Hen (@Daniel8hen), Jiaxiang Li (@JiaxiangBU), Laurae (@Laurae2), Rory Mitchell (@RAMitchell), Egor Smirnov (@SmirnovEgorRu), @alois-bissuel, Andy Adinets (@canonizer), Chen Qin (@chenqin), Harry Braviner (@harrybraviner), Philip Hyunsu Cho (@hcho3), Tong He (@hetong007), @jakirkham, James Lamb (@jameslamb), Julien Schueller (@jschueller), Mayank Suman (@mayanksuman), Hajime Morrita (@omo), Rong Ou (@rongou), Sara Robinson (@sararob), Shaochen Shi (@shishaochen), Xu Xiao (@sperlingxx), @sriramch, Sean Owen (@srowen), Sergei Lebedev (@superbobry), Yuan (Terry) Tang (@terrytangyuan), Theodore Vasiloudis (@thvasilo), Matthew Tovbin (@tovbinm), Jiaming Yuan (@trivialfis), Xin Yin (@xydrolase)
+
 ## v0.82 (2019.03.03)
 This release is packed with many new features and bug fixes.
 

diff --git a/dev/query_contributors.py b/dev/query_contributors.py
@@ -0,0 +1,85 @@
+"""Query list of all contributors and reviewers in a release
+
+Commit subjects in the range [starting commit/tag]..[ending commit/tag] are
+scanned for pull request references of the form "(#1234)". For every such PR
+the GitHub API is asked for its commit authors (contributors) and for the
+users who reviewed or commented on it (reviewers).
+"""
+
+from sh.contrib import git
+import sys
+import re
+import requests
+import json
+
+if len(sys.argv) != 5:
+    print(f'Usage: {sys.argv[0]} [starting commit/tag] [ending commit/tag] [GitHub username] [GitHub password]')
+    sys.exit(1)
+
+from_commit = sys.argv[1]
+to_commit = sys.argv[2]
+# NOTE(review): credentials given as command-line arguments can end up in the shell history.
+auth = (sys.argv[3], sys.argv[4])
+
+API_ROOT = 'https://api.github.com'
+PR_PATTERN = re.compile(r'\(#([0-9]+)\)')  # raw string: '\(' is not a valid str escape
+
+contributors = set()
+reviewers = set()
+
+
+def github_get(url):
+    """GET `url` with basic auth and return the response; raise RuntimeError unless it is 200 OK."""
+    r = requests.get(url, auth=auth)
+    if r.status_code != requests.codes.ok:
+        raise RuntimeError(f'Code: {r.status_code}, Text: {r.text}')
+    return r
+
+
+def github_get_list(url):
+    """Return all items of a list endpoint, following the 'next' Link header so long lists are not truncated."""
+    items = []
+    while url:
+        r = github_get(url)
+        items.extend(json.loads(r.text))
+        url = r.links.get('next', {}).get('url')
+    return items
+
+
+def user_logins(items):
+    """Return the login of each item's 'user', skipping items whose 'user' is null."""
+    return [x['user']['login'] for x in items if x.get('user')]
+
+
+def format_users(logins):
+    """Return 'Name (@login)', or '@login' if the profile has no name, for each login, sorted and comma-separated."""
+    entries = []
+    for x in sorted(logins):
+        user_info = json.loads(github_get(f'{API_ROOT}/users/{x}').text)
+        if user_info['name'] is None:
+            entries.append(f'@{x}')
+        else:
+            entries.append(f"{user_info['name']} (@{x})")
+    return ', '.join(entries)
+
+
+for line in git.log(f'{from_commit}..{to_commit}', '--pretty=format:%s', '--reverse'):
+    m = PR_PATTERN.search(line.rstrip())
+    if m:
+        pr_id = m.group(1)
+        print(f'PR #{pr_id}')
+        repo = f'{API_ROOT}/repos/dmlc/xgboost'
+
+        commit_list = github_get_list(f'{repo}/pulls/{pr_id}/commits?per_page=100')
+        try:
+            contributors.update([commit['author']['login'] for commit in commit_list])
+        except TypeError:
+            # e.g. a commit whose 'author' is null cannot be attributed automatically; ask the operator.
+            contributors.update(str(input(f'Error fetching contributors for PR #{pr_id}. Enter it manually, as a space-separated list:')).split())
+
+        reviewers.update(user_logins(github_get_list(f'{repo}/pulls/{pr_id}/reviews?per_page=100')))
+        reviewers.update(user_logins(github_get_list(f'{repo}/issues/{pr_id}/comments?per_page=100')))
+
+# One line per list, so the two lists no longer run together in the output.
+print('Contributors: ' + format_users(contributors))
+print('Reviewers: ' + format_users(reviewers))
diff --git a/doc/conf.py b/doc/conf.py
@@ -33,6 +33,11 @@
   call('if [ -d tmp ]; then rm -rf tmp; fi; mkdir -p tmp/jvm; cd tmp/jvm; tar xvf {}'.format(filename), shell=True)
 except HTTPError:
   print('JVM doc not found. Skipping...')
+try:
+  filename, _ = urllib.request.urlretrieve('https://s3-us-west-2.amazonaws.com/xgboost-docs/doxygen/{}.tar.bz2'.format(git_branch[0]))
+  call('mkdir -p tmp/dev; cd tmp/dev; tar xvf {}; mv doc_doxygen/html/* .; rm -rf doc_doxygen'.format(filename), shell=True)
+except HTTPError:
+  print('C API doc not found. Skipping...')
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the

diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst
@@ -14,6 +14,7 @@ See `Awesome XGBoost <https://github.com/dmlc/xgboost/tree/master/demo>`_ for mo
   Distributed XGBoost with XGBoost4J-Spark <https://xgboost.readthedocs.io/en/latest/jvm/xgboost4j_spark_tutorial.html>
   dart
   monotonic
+  rf
   feature_interaction_constraint
   input_format
   param_tuning

diff --git a/doc/rf.rst → doc/tutorials/rf.rst b/doc/rf.rst → doc/tutorials/rf.rst
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
@@ -6,7 +6,7 @@
 
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm</artifactId>
-    <version>0.83-SNAPSHOT</version>
+    <version>0.90</version>
     <packaging>pom</packaging>
     <name>XGBoost JVM Package</name>
     <description>JVM Package for XGBoost</description>

diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
@@ -6,10 +6,10 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm</artifactId>
-        <version>0.83-SNAPSHOT</version>
+        <version>0.90</version>
     </parent>
     <artifactId>xgboost4j-example</artifactId>
-    <version>0.83-SNAPSHOT</version>
+    <version>0.90</version>
     <packaging>jar</packaging>
     <build>
         <plugins>
@@ -26,7 +26,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-spark</artifactId>
-            <version>0.83-SNAPSHOT</version>
+            <version>0.90</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j-flink</artifactId>
-            <version>0.83-SNAPSHOT</version>
+            <version>0.90</version>
         </dependency>
         <dependency>
             <groupId>org.apache.commons</groupId>

diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
@@ -6,10 +6,10 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm</artifactId>
-        <version>0.83-SNAPSHOT</version>
+        <version>0.90</version>
     </parent>
     <artifactId>xgboost4j-flink</artifactId>
-    <version>0.83-SNAPSHOT</version>
+    <version>0.90</version>
     <build>
         <plugins>
             <plugin>
@@ -26,7 +26,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j</artifactId>
-            <version>0.83-SNAPSHOT</version>
+            <version>0.90</version>
         </dependency>
         <dependency>
             <groupId>org.apache.commons</groupId>

diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml
@@ -6,7 +6,7 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm</artifactId>
-        <version>0.83-SNAPSHOT</version>
+        <version>0.90</version>
     </parent>
     <artifactId>xgboost4j-spark</artifactId>
     <build>
@@ -24,7 +24,7 @@
         <dependency>
             <groupId>ml.dmlc</groupId>
             <artifactId>xgboost4j</artifactId>
-            <version>0.83-SNAPSHOT</version>
+            <version>0.90</version>
         </dependency>
         <dependency>
             <groupId>org.apache.spark</groupId>

diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
@@ -6,10 +6,10 @@
     <parent>
         <groupId>ml.dmlc</groupId>
         <artifactId>xgboost-jvm</artifactId>
-        <version>0.83-SNAPSHOT</version>
+        <version>0.90</version>
     </parent>
     <artifactId>xgboost4j</artifactId>
-    <version>0.83-SNAPSHOT</version>
+    <version>0.90</version>
     <packaging>jar</packaging>
 
     <dependencies>

diff --git a/python-package/setup.py b/python-package/setup.py
@@ -54,4 +54,5 @@
                    'Programming Language :: Python :: 3.5',
                    'Programming Language :: Python :: 3.6',
                    'Programming Language :: Python :: 3.7'],
+      python_requires='>=3.4',
       url='https://github.com/dmlc/xgboost')
diff --git a/python-package/setup_pip.py b/python-package/setup_pip.py
@@ -64,4 +64,14 @@ def has_ext_modules(self):
       # root directory for some machines, and cause confusions on building
       # data_files=[('xgboost', LIB_PATH)],
       distclass=BinaryDistribution,
+      license='Apache-2.0',
+      classifiers=['License :: OSI Approved :: Apache Software License',
+                   'Development Status :: 5 - Production/Stable',
+                   'Operating System :: OS Independent',
+                   'Programming Language :: Python',
+                   'Programming Language :: Python :: 3',
+                   'Programming Language :: Python :: 3.5',
+                   'Programming Language :: Python :: 3.6',
+                   'Programming Language :: Python :: 3.7'],
+      python_requires='>=3.4',
       url='https://github.com/dmlc/xgboost')
diff --git a/python-package/xgboost/VERSION b/python-package/xgboost/VERSION
@@ -1 +1 @@
-0.83.dev0
+0.90
diff --git a/tests/ci_build/doxygen.sh b/tests/ci_build/doxygen.sh
@@ -1,9 +1,23 @@
 #!/bin/bash
+# Configure with BUILD_C_DOC=ON, build, and pack doc_doxygen/ into
+# build/<branch name>.tar.bz2.
+
+if [ $# -ne 1 ]; then
+  echo "Usage: $0 [branch name]"
+  exit 1
+fi
+
 set -e
 set -x
 
+branch_name="$1"
+
 rm -rf build
 mkdir build
 cd build
 cmake .. -DBUILD_C_DOC=ON
 make -j
+
+# Quote the expansion so the archive name stays one word even if the branch
+# name contains whitespace or glob characters.
+tar cvjf "${branch_name}.tar.bz2" doc_doxygen/