diff --git a/CMakeLists.txt b/CMakeLists.txt index 538ee4790cda..ab93bbb0ee7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,11 +14,8 @@ if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) endif() message(STATUS "xgboost VERSION: ${xgboost_VERSION}") -set(XGBOOST_DEFINITIONS - ${XGBOOST_DEFINITIONS} - -DXGBOOST_VER_MAJOR=${xgboost_VERSION_MAJOR} - -DXGBOOST_VER_MINOR=${xgboost_VERSION_MINOR} - -DXGBOOST_VER_PATCH=${xgboost_VERSION_PATCH}) +include (${xgboost_SOURCE_DIR}/cmake/Version.cmake) +write_version() set_default_configuration_release() #-- Options diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc index 963b9782dd45..c56a3bfa59cf 100644 --- a/amalgamation/xgboost-all0.cc +++ b/amalgamation/xgboost-all0.cc @@ -25,7 +25,9 @@ // gbms #include "../src/gbm/gbm.cc" #include "../src/gbm/gbtree.cc" +#include "../src/gbm/gbtree_model.cc" #include "../src/gbm/gblinear.cc" +#include "../src/gbm/gblinear_model.cc" // data #include "../src/data/data.cc" diff --git a/cmake/Version.cmake b/cmake/Version.cmake new file mode 100644 index 000000000000..f564031a17da --- /dev/null +++ b/cmake/Version.cmake @@ -0,0 +1,5 @@ +function (write_version) + configure_file( + ${xgboost_SOURCE_DIR}/cmake/build_config.h.in + ${xgboost_SOURCE_DIR}/include/xgboost/build_config.h @ONLY) +endfunction (write_version) diff --git a/cmake/build_config.h.in b/cmake/build_config.h.in new file mode 100644 index 000000000000..3bfab88dc59b --- /dev/null +++ b/cmake/build_config.h.in @@ -0,0 +1,28 @@ +/*! + * Copyright 2019 by Contributors + * \file build_config.h + * + * Generated from `cmake/build_config.h.in` by cmake. + */ +#ifndef XGBOOST_BUILD_CONFIG_H_ +#define XGBOOST_BUILD_CONFIG_H_ + +// These checks are for the Makefile. +#if !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined(XGBOOST_BUILTIN_PREFETCH_PRESENT) /* default logic for software pre-fetching */ +#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))) || defined(__INTEL_COMPILER) // Enable _mm_prefetch for Intel compiler and MSVC+x86 + #define XGBOOST_MM_PREFETCH_PRESENT + #define XGBOOST_BUILTIN_PREFETCH_PRESENT +#elif defined(__GNUC__) // Enable __builtin_prefetch for GCC +#define XGBOOST_BUILTIN_PREFETCH_PRESENT +#endif // GUARDS + +#endif // !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined() + +#define XGBOOST_VER_MAJOR @xgboost_VERSION_MAJOR@ +#define XGBOOST_VER_MINOR @xgboost_VERSION_MINOR@ +#define XGBOOST_VER_PATCH @xgboost_VERSION_PATCH@ + +#endif // XGBOOST_BUILD_CONFIG_H_ diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 0922cb22e73c..479fbbc0ffc6 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -197,6 +197,21 @@ inline XGBOOST_DEVICE void GradientPairInternal::SetHess(float h) { } // namespace detail +class Json; + +struct Serializable { + /*! + * \brief load the model from a json object + * \param in json object to load the model from + */ + virtual void Load(Json const& in) = 0; + /*! + * \brief save the model to a json object + * \param out json container to save the model to + */ + virtual void Save(Json* out) const = 0; +}; + /*! \brief gradient statistics pair usually needed in gradient boosting */ using GradientPair = detail::GradientPairInternal;
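The `Serializable` interface added to `base.h` above is the backbone of this patch: learners, gradient boosters, objectives, and trees all override the same `Load(Json const&)`/`Save(Json*)` pair. A minimal sketch of an implementation, assuming the `Json`, `Object`, `String`, and `get<...>` accessors from `xgboost/json.h` that the rest of the patch relies on (the `ToyModel` class and its keys are hypothetical, for illustration only):

namespace xgboost {
// A sketch, not part of the patch: a hypothetical component implementing
// the new Serializable interface with the Json types from xgboost/json.h.
class ToyModel : public Serializable {
 public:
  void Save(Json* p_out) const override {
    auto& out = *p_out;
    out["name"] = String("toy");  // components identify themselves by name
    out["weight"] = weight_;      // plain floats are accepted, as in GBLinearModel::Save
  }
  void Load(Json const& in) override {
    weight_ = get<Number const>(in["weight"]);
  }

 private:
  float weight_ {0.0f};
};
}  // namespace xgboost

The "name" key matters on the way back in: `LearnerImpl::Load` below reads `gradient_booster["name"]` to re-create the right booster before delegating to its `Load`.

diff --git a/include/xgboost/build_config.h b/include/xgboost/build_config.h index 6d364a6ff081..f626f390a2c7 100644 --- a/include/xgboost/build_config.h +++ b/include/xgboost/build_config.h @@ -1,6 +1,8 @@ /*!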
* Copyright 2019 by Contributors * \file build_config.h + * + * Generated from `cmake/build_config.h.in` by cmake. */ #ifndef XGBOOST_BUILD_CONFIG_H_ #define XGBOOST_BUILD_CONFIG_H_ @@ -19,4 +21,8 @@ #endif // !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined() +#define XGBOOST_VER_MAJOR 1 +#define XGBOOST_VER_MINOR 0 +#define XGBOOST_VER_PATCH 0 + #endif // XGBOOST_BUILD_CONFIG_H_ diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index 99ff7989c9f7..4b43583beb99 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -27,7 +28,7 @@ namespace xgboost { /*! * \brief interface of gradient boosting model. */ -class GradientBooster { +class GradientBooster : public Serializable { protected: GenericParameter const* learner_param_; @@ -46,11 +47,21 @@ class GradientBooster { * \param fi input stream. */ virtual void Load(dmlc::Stream* fi) = 0; + /*! + * \brief load model from json + * \param in input json model. + */ + void Load(Json const& in) override = 0; /*! * \brief save model to stream. * \param fo output stream */ virtual void Save(dmlc::Stream* fo) const = 0; + /*! + * \brief Save model to Json + * \param out output json document. + */ + void Save(Json* out) const override = 0; /*! * \brief whether the model allow lazy checkpoint * return true if model is only updated in DoBoost diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 8897dc831c63..23f6bfaef7ba 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -9,13 +9,13 @@ #define XGBOOST_LEARNER_H_ #include - #include #include #include #include #include #include +#include #include #include @@ -41,7 +41,7 @@ namespace xgboost { * * \endcode */ -class Learner : public rabit::Serializable { +class Learner : public Serializable, public rabit::Serializable { public: /*! \brief virtual destructor */ ~Learner() override = default; @@ -49,11 +49,23 @@ class Learner : public rabit::Serializable { * \brief Configure Learner based on set parameters. */ virtual void Configure() = 0; + + /*! + * \brief load model from json object + * \param in input json object + */ + void Load(Json const& in) override = 0; /*! * \brief load model from stream * \param fi input stream. */ void Load(dmlc::Stream* fi) override = 0; + + /*! + * \brief save model to json object + * \param out output json object + */ + void Save(Json* out) const override = 0; /*! * \brief save model to stream. * \param fo output stream diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h index 466f87e0ed57..dd6108a40ed3 100644 --- a/include/xgboost/objective.h +++ b/include/xgboost/objective.h @@ -21,8 +21,10 @@ namespace xgboost { +class Json; + /*! \brief interface of objective function */ -class ObjFunction { +class ObjFunction : public Serializable { protected: GenericParameter const* tparam_; @@ -72,6 +74,9 @@ class ObjFunction { virtual bst_float ProbToMargin(bst_float base_score) const { return base_score; } + + virtual void Save(Json* out) const = 0; + virtual void Load(Json const& in) = 0; /*! * \brief Create an objective function according to name. * \param tparam Generic parameters. diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 37f61dfae8d1..374124a373cf 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -9,6 +9,7 @@ #include #include + #include #include #include @@ -24,6 +25,8 @@ namespace xgboost { struct PathElement; // forward declaration +class Json; + /*! 
\brief meta parameters of the tree */ struct TreeParam : public dmlc::Parameter { /*! \brief number of start root */ @@ -57,6 +60,7 @@ struct TreeParam : public dmlc::Parameter { // other arguments are set by the algorithm. DMLC_DECLARE_FIELD(num_roots).set_lower_bound(1).set_default(1) .describe("Number of start root of trees."); + DMLC_DECLARE_FIELD(num_nodes).set_lower_bound(1).set_default(1); DMLC_DECLARE_FIELD(num_feature) .describe("Number of features used in tree construction."); DMLC_DECLARE_FIELD(size_leaf_vector).set_lower_bound(0).set_default(0) @@ -80,7 +84,7 @@ struct RTreeNodeStat { /*! \brief weight of current node */ bst_float base_weight; /*! \brief number of child that is leaf node known up to now */ - int leaf_child_cnt; + int leaf_child_cnt {0}; bool operator==(const RTreeNodeStat& b) const { return loss_chg == b.loss_chg && sum_hess == b.sum_hess && base_weight == b.base_weight && leaf_child_cnt == b.leaf_child_cnt; @@ -91,7 +95,7 @@ struct RTreeNodeStat { * \brief define regression tree to be the most common tree model. * This is the data structure used in xgboost's major tree models. */ -class RegTree { +class RegTree : public Serializable { public: /*! \brief auxiliary statistics of node to help tree building */ using SplitCondT = bst_float; @@ -103,6 +107,12 @@ class RegTree { static_assert(sizeof(Node) == 4 * sizeof(int) + sizeof(Info), "Node: 64 bit align"); } + Node(int32_t cleft, int32_t cright, int32_t parent, + uint32_t split_ind, float split_cond, bool default_left) : + parent_{parent}, cleft_{cleft}, cright_{cright} { + this->SetSplit(split_ind, split_cond, default_left); + } + /*! \brief index of left child */ XGBOOST_DEVICE int LeftChild() const { return this->cleft_; @@ -216,9 +226,9 @@ class RegTree { }; // pointer to parent, highest bit is used to // indicate whether it's a left child or not - int parent_; + int parent_{-1}; // pointer to left, right - int cleft_, cright_; + int cleft_{-1}, cright_{-1}; // split feature index, left split or right split depends on the highest bit unsigned sindex_{0}; // extra info @@ -307,6 +317,8 @@ class RegTree { } CHECK_EQ(static_cast(deleted_nodes_.size()), param.num_deleted); } + + void Load(Json const& in) override; /*! * \brief save model to stream * \param fo output stream @@ -320,6 +332,8 @@ class RegTree { fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); } + void Save(Json* out) const override; + bool operator==(const RegTree& b) const { return nodes_ == b.nodes_ && stats_ == b.stats_ && deleted_nodes_ == b.deleted_nodes_ && param == b.param; diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h index 0a6a30fba346..a758d35e88d2 100644 --- a/include/xgboost/tree_updater.h +++ b/include/xgboost/tree_updater.h @@ -22,6 +22,9 @@ #include "../../src/common/host_device_vector.h" namespace xgboost { + +class Json; + /*! * \brief interface of tree update module, that performs update of a tree. */ @@ -67,6 +70,8 @@ class TreeUpdater { virtual char const* Name() const = 0; + virtual void Save(Json* out) const {} + /*! * \brief Create a tree updater given name * \param name Name of the tree updater. diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc index 6ce653875d19..19d28119ff34 100644 --- a/plugin/example/custom_obj.cc +++ b/plugin/example/custom_obj.cc @@ -1,5 +1,5 @@ /*! - * Copyright 2015 by Contributors + * Copyright 2015-2019 by Contributors * \file custom_metric.cc * \brief This is an example to define plugin of xgboost. 
* This plugin defines the additional metric function. @@ -7,6 +7,7 @@ #include #include #include +#include namespace xgboost { namespace obj { @@ -69,6 +70,16 @@ class MyLogistic : public ObjFunction { return -std::log(1.0f / base_score - 1.0f); } + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("MyLogistic"); + out["MyLogisticParam"] = toJson(param_); + } + + void Load(Json const& in) override { + param_.InitAllowUnknown(fromJson(get(in["MyLogisticParam"]))); + } + private: MyLogisticParam param_; }; diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index e2115194d92b..6304668408df 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -12,14 +12,18 @@ # pylint: disable=invalid-name, redefined-builtin STRING_TYPES = (str,) - def py_str(x): """convert c string back to python string""" - return x.decode('utf-8') + try: + s = x.decode('utf-8') + except UnicodeDecodeError as e: + print('Failed to decode error message, please file a bug report.') + print(e) + s = x + return s else: STRING_TYPES = (basestring,) # pylint: disable=undefined-variable - def py_str(x): """convert c string back to python string""" return x diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 601aad994f46..a8ce161c85f0 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014 by Contributors +// Copyright (c) 2014-2019 by Contributors #include #include @@ -11,16 +11,19 @@ #include #include +#include #include #include #include #include +#include "xgboost/json.h" #include "./c_api_error.h" #include "../data/simple_csr_source.h" #include "../common/math.h" #include "../common/io.h" #include "../common/group_data.h" +#include "../common/timer.h" namespace xgboost { @@ -891,6 +894,10 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle, XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) { API_BEGIN(); CHECK_HANDLE(); + auto splited = common::Split(fname, '.'); + if (splited.size() > 1 && splited.back() == "json") { + static_cast(handle)->SetParam("model_format", "json"); + } std::unique_ptr fi(dmlc::Stream::Create(fname, "r")); static_cast(handle)->Load(fi.get()); API_END(); @@ -899,15 +906,25 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) { XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, const char* fname) { API_BEGIN(); CHECK_HANDLE(); + common::Monitor monitor; + monitor.Init(__func__); + monitor.Start(__func__); + auto splited = common::Split(fname, '.'); + if (splited.size() > 1 && splited.back() == "json") { + // Save as json document. 
+ static_cast(handle)->SetParam("model_format", "json"); + static_cast(handle)->Configure(); + } std::unique_ptr fo(dmlc::Stream::Create(fname, "w")); auto *bst = static_cast(handle); bst->Save(fo.get()); + monitor.Stop(__func__); API_END(); } XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, - const void* buf, - xgboost::bst_ulong len) { + const void* buf, + xgboost::bst_ulong len) { API_BEGIN(); CHECK_HANDLE(); common::MemoryFixSizeBuffer fs((void*)buf, len); // NOLINT(*) @@ -916,8 +933,8 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, } XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, - xgboost::bst_ulong* out_len, - const char** out_dptr) { + xgboost::bst_ulong* out_len, + const char** out_dptr) { std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str; raw_str.resize(0); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index ea371e5b1294..4c6992e8140b 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -14,6 +14,8 @@ #include #include #include + +#include "xgboost/json.h" #include "../common/timer.h" namespace xgboost { @@ -69,9 +71,25 @@ class GBLinear : public GradientBooster { void Load(dmlc::Stream* fi) override { model_.Load(fi); } + void Load(Json const& in) override { + fromJson(in["gblinear_train_param"], ¶m_); + + auto const& model = in["model"]; + model_.Load(model); + } + void Save(dmlc::Stream* fo) const override { model_.Save(fo); } + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String{"gblinear"}; + out["gblinear_train_param"] = toJson(param_); + + out["model"] = Object(); + auto& model = out["model"]; + model_.Save(&model); + } void DoBoost(DMatrix *p_fmat, HostDeviceVector *in_gpair, @@ -143,9 +161,9 @@ class GBLinear : public GradientBooster { for (const auto &batch : p_fmat->GetBatches()) { // parallel over local batch const auto nsize = static_cast(batch.Size()); - #pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) for (bst_omp_uint i = 0; i < nsize; ++i) { - auto inst = batch[i]; + auto inst = batch[i]; auto row_idx = static_cast(batch.base_rowid + i); // loop over output groups for (int gid = 0; gid < ngroup; ++gid) { @@ -203,7 +221,7 @@ class GBLinear : public GradientBooster { // k is number of group // parallel over local batch const auto nsize = static_cast(batch.Size()); - #pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(static) for (omp_ulong i = 0; i < nsize; ++i) { const size_t ridx = batch.base_rowid + i; // loop over output groups @@ -256,8 +274,8 @@ class GBLinear : public GradientBooster { } } - inline void Pred(const SparsePage::Inst &inst, bst_float *preds, int gid, - bst_float base) { + void Pred(const SparsePage::Inst &inst, bst_float *preds, int gid, + bst_float base) { bst_float psum = model_.bias()[gid] + base; for (const auto& ins : inst) { if (ins.index >= model_.param.num_feature) continue; diff --git a/src/gbm/gblinear_model.cc b/src/gbm/gblinear_model.cc new file mode 100644 index 000000000000..e2e77fb84109 --- /dev/null +++ b/src/gbm/gblinear_model.cc @@ -0,0 +1,39 @@ +/*! 
+ * Copyright 2019 by Contributors + */ +#include <type_traits> +#include <utility> + +#include "gblinear_model.h" + +namespace xgboost { +namespace gbm { + +void GBLinearModel::Save(Json* p_out) const { + using WT = std::remove_reference<decltype(std::declval<decltype(weight)>().back())>::type; + using JT = Number::Float; + static_assert(std::is_same<WT, JT>::value, ""); + auto& out = *p_out; + out["model_param"] = toJson(param); + + size_t const n_weights = weight.size(); + std::vector<Json> j_weights(n_weights); + for (size_t i = 0; i < n_weights; ++i) { + j_weights[i] = weight[i]; + } + out["weights"] = std::move(j_weights); +} + +void GBLinearModel::Load(Json const& in) { + fromJson(in["model_param"], &param); + + auto const& j_weights = get<Array const>(in["weights"]); + auto n_weights = j_weights.size(); + weight.resize(n_weights); + for (size_t i = 0; i < n_weights; ++i) { + weight[i] = get<Number const>(j_weights[i]); + } +} + +} // namespace gbm +} // namespace xgboost diff --git a/src/gbm/gblinear_model.h b/src/gbm/gblinear_model.h index 57daed398a06..1a4f9916dc29 100644 --- a/src/gbm/gblinear_model.h +++ b/src/gbm/gblinear_model.h @@ -1,5 +1,5 @@ /*! - * Copyright by Contributors 2018 + * Copyright 2018-2019 by Contributors */ #pragma once #include #include @@ -10,6 +10,8 @@ #include #include +#include "xgboost/json.h" + namespace xgboost { namespace gbm { // model parameter @@ -47,13 +49,17 @@ class GBLinearModel { weight.resize((param.num_feature + 1) * param.num_output_group); std::fill(weight.begin(), weight.end(), 0.0f); } + + void Save(Json* p_out) const; + void Load(Json const& in); + // save the model to file - inline void Save(dmlc::Stream* fo) const { + void Save(dmlc::Stream* fo) const { fo->Write(&param, sizeof(param)); fo->Write(weight); } // load model from file - inline void Load(dmlc::Stream* fi) { + void Load(dmlc::Stream* fi) { CHECK_EQ(fi->Read(&param, sizeof(param)), sizeof(param)); fi->Read(&weight); } @@ -110,5 +116,6 @@ class GBLinearModel { return v; } }; + } // namespace gbm } // namespace xgboost
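Combined with the weight layout in `gblinear_model.h` above (`(num_feature + 1) * num_output_group` entries, with the biases stored after the feature weights), `GBLinearModel::Save` produces a gblinear document shaped roughly like the sketch below. This is illustrative only, not a schema guarantee — the warning in `learner.cc` later in this patch says as much. Note that `toJson` renders parameter fields as strings, which is why the tests later in this patch compare `num_feature` against `"16"` and expect 17 weights (16 + 1 bias):

{
  "name": "gblinear",
  "gblinear_train_param": { ... },
  "model": {
    "model_param": { "num_feature": "16", "num_output_group": "1" },
    "weights": [w_0, w_1, ..., w_15, bias]
  }
}

diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 6a5c0714cfbc..f18d1625e35b 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -26,7 +26,6 @@ #include "gbtree_model.h" #include "../common/timer.h" - namespace xgboost { namespace gbm { @@ -65,13 +64,13 @@ void GBTree::Configure(const Args& cfg) { // dependency on DMatrix once `hist` tree method can handle external memory so that we can // make it default. void GBTree::ConfigureWithKnownData(std::map const& cfg, DMatrix* fmat) { - std::string updater_seq = tparam_.updater_seq; + std::string const updater_seq = tparam_.updater; tparam_.InitAllowUnknown(cfg); this->PerformTreeMethodHeuristic({this->cfg_.begin(), this->cfg_.end()}, fmat); this->ConfigureUpdaters({this->cfg_.begin(), this->cfg_.end()}); - LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq; + LOG(DEBUG) << "Using updaters: " << tparam_.updater; // initialize the updaters only when needed.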
- if (updater_seq != tparam_.updater_seq) { + if (updater_seq != tparam_.updater) { this->updaters_.clear(); } this->InitUpdater(); @@ -93,7 +92,7 @@ void GBTree::PerformTreeMethodHeuristic(std::map const return; } - tparam_.updater_seq = "grow_histmaker,prune"; + tparam_.updater = "grow_histmaker,prune"; if (rabit::IsDistributed()) { LOG(WARNING) << "Tree method is automatically selected to be 'approx' " @@ -112,7 +111,7 @@ void GBTree::PerformTreeMethodHeuristic(std::map const tparam_.tree_method = TreeMethod::kApprox; } else { tparam_.tree_method = TreeMethod::kExact; - tparam_.updater_seq = "grow_colmaker,prune"; + tparam_.updater = "grow_colmaker,prune"; } LOG(DEBUG) << "Using tree method: " << static_cast(tparam_.tree_method); } @@ -135,27 +134,27 @@ void GBTree::ConfigureUpdaters(const std::map& cfg) { // This choice is deferred to PerformTreeMethodHeuristic(). break; case TreeMethod::kApprox: - tparam_.updater_seq = "grow_histmaker,prune"; + tparam_.updater = "grow_histmaker,prune"; break; case TreeMethod::kExact: - tparam_.updater_seq = "grow_colmaker,prune"; + tparam_.updater = "grow_colmaker,prune"; break; case TreeMethod::kHist: LOG(INFO) << "Tree method is selected to be 'hist', which uses a " "single updater grow_quantile_histmaker."; - tparam_.updater_seq = "grow_quantile_histmaker"; + tparam_.updater = "grow_quantile_histmaker"; break; case TreeMethod::kGPUExact: this->AssertGPUSupport(); - tparam_.updater_seq = "grow_gpu,prune"; + tparam_.updater = "grow_gpu,prune"; if (cfg.find("predictor") == cfg.cend()) { tparam_.predictor = "gpu_predictor"; } break; case TreeMethod::kGPUHist: this->AssertGPUSupport(); - tparam_.updater_seq = "grow_gpu_hist"; + tparam_.updater = "grow_gpu_hist"; if (cfg.find("predictor") == cfg.cend()) { tparam_.predictor = "gpu_predictor"; } @@ -205,7 +204,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, void GBTree::InitUpdater() { if (updaters_.size() != 0) return; - std::string tval = tparam_.updater_seq; + std::string tval = tparam_.updater; std::vector ups = common::Split(tval, ','); for (const std::string& pstr : ups) { std::unique_ptr up(TreeUpdater::Create(pstr.c_str(), learner_param_)); @@ -253,6 +252,22 @@ void GBTree::CommitModel(std::vector>>&& ne GetPredictor()->UpdatePredictionCache(model_, &updaters_, num_new_trees); } +void GBTree::Load(Json const& in) { + fromJson(in["gbtree_train_param"], &tparam_); + model_.Load(in["model"]); +} + +void GBTree::Save(Json* p_out) const { + auto& out = *p_out; + out["name"] = String("gbtree"); + out["gbtree_train_param"] = toJson(tparam_); + + // model + out["model"] = Object(); + auto& model = out["model"]; + model_.Save(&model); +} + // dart class Dart : public GBTree { @@ -273,6 +288,16 @@ class Dart : public GBTree { fi->Read(&weight_drop_); } } + void Load(Json const& in) override { + auto const& gbtree = in["gbtree"]; + GBTree::Load(gbtree); + auto j_weight_drop = get(in["weight_drop"]); + weight_drop_.resize(j_weight_drop.size()); + fromJson(in["dart_train_param"], &dparam_); + for (size_t i = 0; i < weight_drop_.size(); ++i) { + weight_drop_[i] = get(j_weight_drop[i]); + } + } void Save(dmlc::Stream* fo) const override { GBTree::Save(fo); @@ -280,6 +305,19 @@ class Dart : public GBTree { fo->Write(weight_drop_); } } + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("dart"); + out["dart_train_param"] = toJson(dparam_); + out["gbtree"] = Object(); + auto& gbtree = out["gbtree"]; + GBTree::Save(&gbtree); + std::vector j_weight_drop(weight_drop_.size()); + for (size_t 
i = 0; i < weight_drop_.size(); ++i) { + j_weight_drop[i] = Number(weight_drop_[i]); + } + out["weight_drop"] = Array(j_weight_drop); + } // predict the leaf scores with dropout if ntree_limit = 0 void PredictBatch(DMatrix* p_fmat, diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 91fef174b014..72f2689db55d 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -54,7 +54,7 @@ struct GBTreeTrainParam : public dmlc::Parameter { */ int num_parallel_tree; /*! \brief tree updater sequence */ - std::string updater_seq; + std::string updater; /*! \brief type of boosting process to run */ TreeProcessType process_type; // predictor name @@ -68,7 +68,7 @@ struct GBTreeTrainParam : public dmlc::Parameter { .set_lower_bound(1) .describe("Number of parallel trees constructed during each iteration."\ " This option is used to support boosted random forest."); - DMLC_DECLARE_FIELD(updater_seq) + DMLC_DECLARE_FIELD(updater) .set_default("grow_colmaker,prune") .describe("Tree updater sequence."); DMLC_DECLARE_FIELD(process_type) @@ -77,8 +77,6 @@ struct GBTreeTrainParam : public dmlc::Parameter { .add_enum("update", TreeProcessType::kUpdate) .describe("Whether to run the normal boosting process that creates new trees,"\ " or to update the trees in an existing model."); - // add alias - DMLC_DECLARE_ALIAS(updater_seq, updater); DMLC_DECLARE_FIELD(predictor) .set_default("cpu_predictor") .describe("Predictor algorithm type"); @@ -175,6 +173,10 @@ class GBTree : public GradientBooster { tparam_.tree_method == TreeMethod::kGPUExact; } + GBTreeTrainParam const& GetTrainParam() const { + return tparam_; + } + void Load(dmlc::Stream* fi) override { model_.Load(fi); @@ -183,17 +185,16 @@ class GBTree : public GradientBooster { common::ToString(model_.param.num_feature)); } - GBTreeTrainParam const& GetTrainParam() const { - return tparam_; - } + void Load(Json const& in) override; void Save(dmlc::Stream* fo) const override { model_.Save(fo); } + void Save(Json* p_out) const override; bool AllowLazyCheckPoint() const override { return model_.param.num_output_group == 1 || - tparam_.updater_seq.find("distcol") != std::string::npos; + tparam_.updater.find("distcol") != std::string::npos; } void PredictBatch(DMatrix* p_fmat, diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc new file mode 100644 index 000000000000..4fb16636adb3 --- /dev/null +++ b/src/gbm/gbtree_model.cc @@ -0,0 +1,46 @@ +/*! 
+ * Copyright 2019 by Contributors + */ +#include "xgboost/json.h" +#include "xgboost/logging.h" +#include "gbtree_model.h" + +namespace xgboost { +namespace gbm { + +void GBTreeModel::Load(Json const& in) { + fromJson(in["model_param"], ¶m); + + trees.clear(); + trees_to_update.clear(); + + auto const& trees_json = get(in["trees"]); + trees.resize(trees_json.size()); + + for (size_t t = 0; t < trees.size(); ++t) { + trees[t].reset( new RegTree() ); + trees[t]->Load(trees_json[t]); + } + + tree_info.resize(param.num_trees); +} + +void GBTreeModel::Save(Json* p_out) const { + auto& out = *p_out; + CHECK_EQ(param.num_trees, static_cast(trees.size())); + out["model_param"] = toJson(param); + std::vector trees_json; + size_t t = 0; + for (auto const& tree : trees) { + Json tree_json{Object()}; + tree->Save(&tree_json); + tree_json["id"] = std::to_string(t); + trees_json.emplace_back(tree_json); + t++; + } + + out["trees"] = Array(trees_json); +} + +} // namespace gbm +} // namespace xgboost diff --git a/src/gbm/gbtree_model.h b/src/gbm/gbtree_model.h index fff339b59436..fbb5dcdb4dbd 100644 --- a/src/gbm/gbtree_model.h +++ b/src/gbm/gbtree_model.h @@ -1,7 +1,10 @@ /*! - * Copyright by Contributors 2017 + * Copyright 2017-2019 by Contributors + * \file gbtree_model.h */ -#pragma once +#ifndef XGBOOST_GBM_GBTREE_MODEL_H_ +#define XGBOOST_GBM_GBTREE_MODEL_H_ + #include #include #include @@ -12,6 +15,9 @@ #include namespace xgboost { + +class Json; + namespace gbm { /*! \brief model parameters */ struct GBTreeModelParam : public dmlc::Parameter { @@ -43,6 +49,10 @@ struct GBTreeModelParam : public dmlc::Parameter { } // declare parameters, only declare those that need to be set. DMLC_DECLARE_PARAMETER(GBTreeModelParam) { + DMLC_DECLARE_FIELD(num_trees) + .set_lower_bound(0) + .set_default(0) + .describe(""); DMLC_DECLARE_FIELD(num_output_group) .set_lower_bound(1) .set_default(1) @@ -98,6 +108,7 @@ struct GBTreeModel { sizeof(int) * param.num_trees); } } + void Load(Json const& in); void Save(dmlc::Stream* fo) const { CHECK_EQ(param.num_trees, static_cast(trees.size())); @@ -110,6 +121,8 @@ struct GBTreeModel { } } + void Save(Json* p_out) const; + std::vector DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const { std::vector dump; @@ -140,3 +153,5 @@ struct GBTreeModel { }; } // namespace gbm } // namespace xgboost + +#endif // XGBOOST_GBM_GBTREE_MODEL_H_ diff --git a/src/learner.cc b/src/learner.cc index dd40b835238f..a76d7a65c278 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -5,12 +5,8 @@ * \author Tianqi Chen */ #include -#include -#include -#include -#include -#include -#include +#include + #include #include #include @@ -19,11 +15,18 @@ #include #include #include -#include "./common/common.h" -#include "./common/host_device_vector.h" -#include "./common/io.h" -#include "./common/random.h" -#include "./common/timer.h" + +#include "xgboost/feature_map.h" +#include "xgboost/learner.h" +#include "xgboost/logging.h" +#include "xgboost/generic_parameters.h" +#include "xgboost/json.h" + +#include "common/common.h" +#include "common/host_device_vector.h" +#include "common/io.h" +#include "common/random.h" +#include "common/timer.h" namespace { @@ -106,6 +109,7 @@ struct LearnerTrainParam : public dmlc::Parameter { DataSplitMode dsplit; // flag to disable default metric int disable_default_eval_metric; + std::string model_format; std::string booster; std::string objective; @@ -118,6 +122,9 @@ struct LearnerTrainParam : public dmlc::Parameter { .add_enum("col", 
DataSplitMode::kCol) .add_enum("row", DataSplitMode::kRow) .describe("Data split mode for distributed training."); + DMLC_DECLARE_FIELD(model_format) + .set_default("") + .describe("(Experimental) Specify json for serializing to JSON document"); DMLC_DECLARE_FIELD(disable_default_eval_metric) .set_default(0) .describe("flag to disable default metric. Set to >0 to disable"); @@ -142,11 +149,12 @@ DMLC_REGISTER_PARAMETER(GenericParameter); class LearnerImpl : public Learner { public: explicit LearnerImpl(std::vector > cache) - : configured_{false}, cache_(std::move(cache)) {} + : configured_{false}, cache_(std::move(cache)) { + monitor_.Init("Learner"); + } // Configuration before data is known. void Configure() override { if (configured_) { return; } - monitor_.Init("Learner"); monitor_.Start("Configure"); auto old_tparam = tparam_; Args args = {cfg_.cbegin(), cfg_.cend()}; @@ -190,7 +198,7 @@ class LearnerImpl : public Learner { } } - void Load(dmlc::Stream* fi) override { + void LoadBinary(dmlc::Stream* fi) { generic_param_.InitAllowUnknown(Args{}); tparam_.Init(std::vector>{}); // TODO(tqchen) mark deprecation of old format. @@ -296,13 +304,49 @@ class LearnerImpl : public Learner { for (auto& p_metric : metrics_) { p_metric->Configure({cfg_.begin(), cfg_.end()}); } + } + void Load(dmlc::Stream* fi) override { + monitor_.Start(__func__); + if (cfg_.find("model_format") != cfg_.cend()) { + tparam_.model_format = cfg_.at("model_format"); + } + if (tparam_.model_format == "json") { + LOG(WARNING) << "JSON serialization is at experimental stage. " + << "Output schema is subject to change in the future."; + auto ReadStream = + [](dmlc::Stream* fi) { + std::string buffer; + std::string temp; + size_t read {0}; + size_t size {1}; + do { + size *= 2; + read = 0; + temp.clear(); + temp.resize(size); + read = fi->Read(&temp[0], size); + temp.resize(read); + buffer.append(temp); + } while (read == size); + return buffer; + }; + std::string model_str = ReadStream(fi); + Json model = Json::Load({model_str.c_str(), model_str.size()}); + this->Load(model); + } else { + LOG(INFO) << "Loading binary model"; + this->LoadBinary(fi); + } this->configured_ = true; + monitor_.Stop(__func__); } - // rabit save model to rabit checkpoint void Save(dmlc::Stream* fo) const override { if (!this->configured_) { + if (cfg_.find("num_feature") == cfg_.cend()) { + LOG(FATAL) << "Missing parameter `num_feature` in configuration."; + } // Save empty model. Calling Configure in a dummy LearnerImpl avoids violating // constness. LearnerImpl empty(std::move(this->cache_)); @@ -315,6 +359,93 @@ class LearnerImpl : public Learner { return; } + if (tparam_.model_format == "json") { + LOG(WARNING) << "JSON serialization is at experimental stage. 
" + << "Output schema is subject to change in the future."; + Json model { Object() }; + this->Save(&model); + std::stringstream ss; + Json::Dump(model, &ss); + size_t length = ss.str().size(); + auto const str = ss.str(); + fo->Write(str.c_str(), length); + } else { + LOG(INFO) << "Saving binary model"; + this->SaveBinary(fo); + } + } + + void Save(Json* p_out) const override { + Json& out = *p_out; + Integer::Int major{XGBOOST_VER_MAJOR}, minor{XGBOOST_VER_MINOR}, patch{XGBOOST_VER_PATCH}; + out["version"] = Json(Array{std::vector{ + Json(Integer(major)), + Json(Integer(minor)), + Json(Integer(patch))}}); + + out["Learner"] = Object(); + auto& learner = out["Learner"]; + + learner["learner_model_param"] = toJson(mparam_); + learner["learner_train_param"] = toJson(tparam_); + + learner["gradient_booster"] = Object(); + auto& gradient_booster = learner["gradient_booster"]; + gbm_->Save(&gradient_booster); + + learner["objective"] = Object(); + auto& objective_fn = learner["objective"]; + obj_->Save(&objective_fn); + + std::vector metrics(metrics_.size()); + for (size_t i = 0; i < metrics_.size(); ++i) { + metrics[i] = String(metrics_[i]->Name()); + } + learner["metrics"] = Array(metrics); + + std::map j_cfg; + for (auto const& kv : cfg_) { + j_cfg[kv.first] = kv.second; + } + learner["cfg_cache"] = Object(std::move(j_cfg)); + } + + void Load(Json const& in) override { + Integer::Int major, minor, patch; + std::tie(major, minor, patch) = std::make_tuple(get(get(in["version"])[0]), + get(get(in["version"])[1]), + get(get(in["version"])[2])); + LOG(INFO) << "Loading XGBoost " << major << ", " << minor << " model"; + auto const& learner = get(in["Learner"]); + fromJson(learner.at("learner_model_param"), &mparam_); + fromJson(learner.at("learner_train_param"), &tparam_); + + auto const& gradient_booster = learner.at("gradient_booster"); + gbm_.reset(GradientBooster::Create(get(gradient_booster["name"]), &generic_param_, + cache_, mparam_.base_score)); + gbm_->Load(gradient_booster); + + auto const& objective_fn = learner.at("objective"); + obj_.reset(ObjFunction::Create(tparam_.objective, &generic_param_)); + obj_->Load(objective_fn); + + auto const& j_metrics = learner.at("metrics"); + auto n_metrics = get(j_metrics).size(); + metric_names_.resize(n_metrics); + metrics_.resize(n_metrics); + for (size_t i = 0; i < n_metrics; ++i) { + metric_names_[i]= get(j_metrics[i]); + metrics_[i] = std::unique_ptr(Metric::Create(metric_names_.back(), &generic_param_)); + } + + auto j_cfg = get(learner.at("cfg_cache")); + for (auto const& kv : j_cfg) { + cfg_[kv.first] = get(kv.second); + } + } + + // rabit save model to rabit checkpoint + void SaveBinary(dmlc::Stream* fo) const { LearnerModelParam mparam = mparam_; // make a copy to potentially modify std::vector > extra_attr; // extra attributed to be added just before saving @@ -347,7 +478,7 @@ class LearnerImpl : public Learner { attr[kv.first] = kv.second; } fo->Write(std::vector>( - attr.begin(), attr.end())); + attr.begin(), attr.end())); } if (tparam_.objective == "count:poisson") { auto it = cfg_.find("max_delta_step"); diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu index 0dd3010bc2c3..49bea32ec3ce 100644 --- a/src/objective/hinge.cu +++ b/src/objective/hinge.cu @@ -1,10 +1,12 @@ /*! 
- * Copyright 2018 by Contributors + * Copyright 2018-2019 by Contributors * \file hinge.cc * \brief Provides an implementation of the hinge loss function * \author Henry Gouk */ #include + +#include "xgboost/json.h" #include "../common/math.h" #include "../common/transform.h" #include "../common/common.h" @@ -75,6 +77,12 @@ class HingeObj : public ObjFunction { const char* DefaultEvalMetric() const override { return "error"; } + + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("binary:hinge"); + } + void Load(Json const& in) override {} }; // register the objective functions diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index ca803f522624..a1c81663ae29 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -14,6 +14,7 @@ #include #include +#include "xgboost/json.h" #include "../common/common.h" #include "../common/math.h" #include "../common/transform.h" @@ -157,6 +158,20 @@ class SoftmaxMultiClassObj : public ObjFunction { } } + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("SoftmaxMultiClassObj"); + out["SoftmaxMultiClassParam"] = Object(); + auto& parameter = out["SoftmaxMultiClassParam"]; + for (auto const& kv : param_.__DICT__()) { + parameter[kv.first] = kv.second; + } + } + + void Load(Json const& in) override { + fromJson(in["SoftmaxMultiClassParam"], &param_); + } + private: // output probability bool output_prob_; diff --git a/src/objective/rank_obj.cc b/src/objective/rank_obj.cc index cb186a4f4e03..53316a02fd9c 100644 --- a/src/objective/rank_obj.cc +++ b/src/objective/rank_obj.cc @@ -10,6 +10,9 @@ #include #include #include + +#include "xgboost/json.h" + #include "../common/math.h" #include "../common/random.h" @@ -174,7 +177,16 @@ class LambdaRankObj : public ObjFunction { virtual void GetLambdaWeight(const std::vector &sorted_list, std::vector *io_pairs) = 0; - private: + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("LambdaRankObj"); + out["lambda_rank_param"] = Object(); + for (auto const& kv : param_.__DICT__()) { + out["lambda_rank_param"][kv.first] = kv.second; + } + } + + protected: LambdaRankParam param_; }; @@ -182,6 +194,15 @@ class PairwiseRankObj: public LambdaRankObj{ protected: void GetLambdaWeight(const std::vector &sorted_list, std::vector *io_pairs) override {} + + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("rank:pairwise"); + out["lambda_rank_param"] = toJson(LambdaRankObj::param_); + } + void Load(Json const& in) override { + fromJson(in["lambda_rank_param"], &(LambdaRankObj::param_)); + } +}; // beta version: NDCG lambda rank @@ -232,6 +253,14 @@ class LambdaRankObjNDCG : public LambdaRankObj { } return static_cast(sumdcg); } + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("rank:ndcg"); + out["lambda_rank_param"] = toJson(LambdaRankObj::param_); + } + void Load(Json const& in) override { + fromJson(in["lambda_rank_param"], &(LambdaRankObj::param_)); + } +}; class LambdaRankObjMAP : public LambdaRankObj { @@ -319,6 +348,15 @@ class LambdaRankObjMAP : public LambdaRankObj { pair.neg_index, &map_stats); } } + + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("rank:map"); + out["lambda_rank_param"] = toJson(LambdaRankObj::param_); + } + void Load(Json const& in) override { + fromJson(in["lambda_rank_param"], &(LambdaRankObj::param_)); + } +};
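Every objective above follows the same convention: `Save` writes a "name" entry that matches the registry name plus one parameter block, and `Load` restores the parameters from that same block, so the two sides must agree on the key. A round-trip sketch, assuming the `ObjFunction` factory and the `Json` helpers used elsewhere in this patch:

#include <memory>
#include "xgboost/json.h"
#include "xgboost/objective.h"

// Sketch: save an objective's configuration and feed it straight back in.
void RoundTripNDCG(xgboost::GenericParameter const* tparam) {
  std::unique_ptr<xgboost::ObjFunction> obj{
      xgboost::ObjFunction::Create("rank:ndcg", tparam)};
  obj->Configure({});                // default lambda_rank_param
  xgboost::Json j_obj{xgboost::Object()};
  obj->Save(&j_obj);                 // {"name": "rank:ndcg", "lambda_rank_param": {...}}
  obj->Load(j_obj);                  // reads the same "lambda_rank_param" block back
}

The `NDCG_Json_IO` test later in this patch exercises exactly this path.

// register the objective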
functions diff --git a/src/objective/regression_loss.h b/src/objective/regression_loss.h index 6688452da3de..eae5359f076c 100644 --- a/src/objective/regression_loss.h +++ b/src/objective/regression_loss.h @@ -34,6 +34,8 @@ struct LinearSquareLoss { static bst_float ProbToMargin(bst_float base_score) { return base_score; } static const char* LabelErrorMsg() { return ""; } static const char* DefaultEvalMetric() { return "rmse"; } + + static const char* Name() { return "reg:squarederror"; } }; struct SquaredLogError { @@ -57,6 +59,8 @@ struct SquaredLogError { return "label must be greater than -1 for rmsle so that log(label + 1) can be valid."; } static const char* DefaultEvalMetric() { return "rmsle"; } + + static const char* Name() { return "reg:squaredlogerror"; } }; // logistic loss for probability regression task @@ -90,11 +94,14 @@ struct LogisticRegression { return "label must be in [0,1] for logistic regression"; } static const char* DefaultEvalMetric() { return "rmse"; } + + static const char* Name() { return "reg:logistic"; } }; // logistic loss for binary classification task struct LogisticClassification : public LogisticRegression { static const char* DefaultEvalMetric() { return "error"; } + static const char* Name() { return "binary:logistic"; } }; // logistic loss, but predict un-transformed margin @@ -125,6 +132,8 @@ struct LogisticRaw : public LogisticRegression { return std::max(predt * (T(1.0f) - predt), eps); } static const char* DefaultEvalMetric() { return "auc"; } + + static const char* Name() { return "binary:logitraw"; } }; } // namespace obj diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 0354d57771c1..4e9dc71540c9 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -12,6 +12,7 @@ #include #include +#include "xgboost/json.h" #include "../common/span.h" #include "../common/transform.h" #include "../common/common.h" @@ -113,6 +114,16 @@ class RegLossObj : public ObjFunction { return Loss::ProbToMargin(base_score); } + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String(Loss::Name()); + out["reg_loss_param"] = toJson(param_); + } + + void Load(Json const& in) override { + fromJson(in["reg_loss_param"], ¶m_); + } + protected: RegLossParam param_; }; @@ -226,6 +237,16 @@ class PoissonRegression : public ObjFunction { return "poisson-nloglik"; } + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("PoissonRegression"); + out["poisson_regression_param"] = toJson(param_); + } + + void Load(Json const& in) override { + fromJson(in["poisson_regression_param"], ¶m_); + } + private: PoissonRegressionParam param_; HostDeviceVector label_correct_; @@ -320,6 +341,12 @@ class CoxRegression : public ObjFunction { const char* DefaultEvalMetric() const override { return "cox-nloglik"; } + + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("CoxRegression"); + } + void Load(Json const&) override {} }; // register the objective function @@ -390,6 +417,11 @@ class GammaRegression : public ObjFunction { const char* DefaultEvalMetric() const override { return "gamma-nloglik"; } + void Save(Json* p_out) const override { + auto& out = *p_out; + out["name"] = String("GammaRegression"); + } + void Load(Json const&) override {} private: HostDeviceVector label_correct_; @@ -484,6 +516,15 @@ class TweedieRegression : public ObjFunction { return metric_.c_str(); } + void Save(Json* p_out) const override { + auto& out = *p_out; 
+ out["name"] = String("TweedieRegression"); + out["tweedie_regression_param"] = toJson(param_); + } + void Load(Json const& in) override { + fromJson(in["tweedie_regression_param"], ¶m_); + } + private: std::string metric_; TweedieRegressionParam param_; diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 9d66eef6ac9f..8ca430e5a0ba 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -8,12 +8,15 @@ #include #include +#include + #include #include #include #include #include "param.h" +#include "../common/common.h" namespace xgboost { // register tree parameter @@ -618,6 +621,82 @@ std::string RegTree::DumpModel(const FeatureMap& fmap, return result; } +void RegTree::Load(Json const& in) { + fromJson(in["tree_param"], ¶m); + { + std::vector const& v_j_stats = get(in["stats"]); + stats_.resize(v_j_stats.size()); + + for (size_t i = 0; i < stats_.size(); ++i) { + std::vector const& j_stat = get(v_j_stats[i]); + auto& s = stats_[i]; + s.loss_chg = get(j_stat[0]); + s.sum_hess = get(j_stat[1]); + s.base_weight = get(j_stat[2]); + s.leaf_child_cnt = get(j_stat[3]); + } + CHECK_EQ(param.num_nodes, stats_.size()); + } + + { + std::vector const& v_j_nodes = get(in["nodes"]); + nodes_.resize(v_j_nodes.size()); + + for (size_t i = 0; i < nodes_.size(); ++i) { + auto const& j_node = get(v_j_nodes[i]); + auto& n = nodes_[i]; + auto left = get(j_node[0]); + auto right = get(j_node[1]); + auto parent = get(j_node[2]); + auto split_ind = get(j_node[3]); + auto split_cond = get(j_node[4]); + auto default_left = get(j_node[5]); + n = Node(left, right, parent, split_ind, split_cond, default_left); + } + CHECK_EQ(param.num_nodes, nodes_.size()); + } +} + +void RegTree::Save(Json* p_out) const { + auto& out = *p_out; + CHECK_EQ(param.num_nodes, static_cast(nodes_.size())); + CHECK_EQ(param.num_nodes, static_cast(stats_.size())); + out["tree_param"] = toJson(param); + CHECK_EQ(get(out["tree_param"]["num_nodes"]), std::to_string(param.num_nodes)); + + std::vector v_j_stats(stats_.size()); + using I = Integer::Int; + + for (size_t i = 0; i < stats_.size(); ++i) { + auto const& s = stats_[i]; + std::vector j_stats(4); + j_stats[0] = s.loss_chg; + j_stats[1] = s.sum_hess; + j_stats[2] = s.base_weight; + j_stats[3] = static_cast(s.leaf_child_cnt); + + v_j_stats[i] = std::move(j_stats); + } + out["stats"] = std::move(v_j_stats); + + std::vector v_j_nodes(nodes_.size()); + for (size_t i = 0; i < nodes_.size(); ++i) { + auto const& n = nodes_[i]; + std::vector j_node(6); + j_node[0] = static_cast(n.LeftChild()); + j_node[1] = static_cast(n.RightChild()); + j_node[2] = static_cast(n.Parent()); + j_node[3] = static_cast(n.SplitIndex()); + j_node[4] = n.SplitCond(); + j_node[5] = n.DefaultLeft(); + CHECK(IsA(j_node[5])); + + v_j_nodes[i] = j_node; + } + + out["nodes"] = std::move(v_j_nodes); +} + void RegTree::FillNodeMeanValues() { size_t num_nodes = this->param.num_nodes; if (this->node_mean_values_.size() == num_nodes) { @@ -851,10 +930,9 @@ void RegTree::CalculateContributions(const RegTree::FVec &feat, // Preallocate space for the unique path data const int maxd = this->MaxDepth(root_id) + 2; - auto *unique_path_data = new PathElement[(maxd * (maxd + 1)) / 2]; + std::vector unique_path((maxd * (maxd + 1)) / 2); - TreeShap(feat, out_contribs, root_id, 0, unique_path_data, + TreeShap(feat, out_contribs, root_id, 0, unique_path.data(), 1, 1, -1, condition, condition_feature, 1); - delete[] unique_path_data; } } // namespace xgboost diff --git a/tests/cpp/common/test_common.cc 
b/tests/cpp/common/test_common.cc index ba6946e80854..55673e1d3611 100644 --- a/tests/cpp/common/test_common.cc +++ b/tests/cpp/common/test_common.cc @@ -34,4 +34,5 @@ TEST(GPUSet, Basic) { EXPECT_EQ(GPUSet::AllVisible(), GPUSet::Empty()); #endif } + } // namespace xgboost diff --git a/tests/cpp/gbm/test_gblinear.cc b/tests/cpp/gbm/test_gblinear.cc new file mode 100644 index 000000000000..092426c86796 --- /dev/null +++ b/tests/cpp/gbm/test_gblinear.cc @@ -0,0 +1,54 @@ +/*! + * Copyright 2019 by Contributors + */ +#include + +#include +#include + +#include "../helpers.h" +#include "xgboost/json.h" +#include "xgboost/logging.h" +#include "xgboost/gbm.h" +#include "xgboost/generic_parameters.h" +#include "test_gbm.h" + +namespace xgboost { +namespace gbm { + +TEST(Linear, Json_IO) { + size_t constexpr kRows = 16, kCols = 16; + auto gbm = ConstructGBM("gblinear", {{"num_feature", "16"}}, kRows, kCols); + + Json model {Object()}; + gbm->Save(&model); + ASSERT_TRUE(IsA(model)); + + std::stringstream ss; + Json::Dump(model, &ss); + + auto model_str = ss.str(); + + model = Json::Load({model_str.c_str(), model_str.size()}); + ASSERT_TRUE(IsA(model)); + model = model["model"]; + + { + auto weights = get(model["weights"]); + ASSERT_EQ(weights.size(), 17); + auto model_param = get(model["model_param"]); + ASSERT_EQ(get(model_param["num_feature"]), "16"); + ASSERT_EQ(get(model_param["num_output_group"]), "1"); + } + + { + model = Json::Load({model_str.c_str(), model_str.size()}); + model = model["model"]; + auto weights = get(model["weights"]); + ASSERT_EQ(weights.size(), 17); // 16 + 1 (bias) + } + +} + +} // namespace gbm +} // namespace xgboost diff --git a/tests/cpp/gbm/test_gbm.h b/tests/cpp/gbm/test_gbm.h new file mode 100644 index 000000000000..5da2d34e04a6 --- /dev/null +++ b/tests/cpp/gbm/test_gbm.h @@ -0,0 +1,45 @@ +#ifndef XGBOOST_TEST_GBM_H_ +#define XGBOOST_TEST_GBM_H_ + +#include +#include +#include + +#include +#include + +#include "../helpers.h" + +namespace xgboost { + +inline std::unique_ptr ConstructGBM( + std::string name, Args kwargs, size_t kRows, size_t kCols) { + GenericParameter param; + param.Init(Args{}); + std::unique_ptr gbm { + GradientBooster::Create(name, ¶m, {}, 0)}; + gbm->Configure(kwargs); + auto pp_dmat = CreateDMatrix(kRows, kCols, 0); + auto p_dmat = *pp_dmat; + + std::vector labels(kRows); + for (size_t i = 0; i < kRows; ++i) { + labels[i] = i; + } + p_dmat->Info().labels_.HostVector() = labels; + HostDeviceVector gpair; + auto& h_gpair = gpair.HostVector(); + h_gpair.resize(kRows); + for (size_t i = 0; i < kRows; ++i) { + h_gpair[i] = {static_cast(i), 1}; + } + + gbm->DoBoost(p_dmat.get(), &gpair, nullptr); + + delete pp_dmat; + return gbm; +} + +} // namespace xgboost + +#endif // XGBOOST_TEST_GBM_H_ diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index df9fb44c1112..2329df1f26a8 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -2,6 +2,7 @@ #include #include "../helpers.h" #include "../../../src/gbm/gbtree.h" +#include "test_gbm.h" namespace xgboost { TEST(GBTree, SelectTreeMethod) { @@ -25,30 +26,78 @@ TEST(GBTree, SelectTreeMethod) { gbtree.ConfigureWithKnownData(args, p_dmat); auto const& tparam = gbtree.GetTrainParam(); gbtree.ConfigureWithKnownData({Arg{"tree_method", "approx"}, Arg{"num_feature", n_feat}}, p_dmat); - ASSERT_EQ(tparam.updater_seq, "grow_histmaker,prune"); + ASSERT_EQ(tparam.updater, "grow_histmaker,prune"); gbtree.ConfigureWithKnownData({Arg("tree_method", "exact"), Arg("num_feature", 
n_feat)}, p_dmat); - ASSERT_EQ(tparam.updater_seq, "grow_colmaker,prune"); + ASSERT_EQ(tparam.updater, "grow_colmaker,prune"); gbtree.ConfigureWithKnownData({Arg("tree_method", "hist"), Arg("num_feature", n_feat)}, p_dmat); - ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker"); + ASSERT_EQ(tparam.updater, "grow_quantile_histmaker"); ASSERT_EQ(tparam.predictor, "cpu_predictor"); gbtree.ConfigureWithKnownData({Arg{"booster", "dart"}, Arg{"tree_method", "hist"}, Arg{"num_feature", n_feat}}, p_dmat); - ASSERT_EQ(tparam.updater_seq, "grow_quantile_histmaker"); + ASSERT_EQ(tparam.updater, "grow_quantile_histmaker"); #ifdef XGBOOST_USE_CUDA generic_param.InitAllowUnknown(std::vector{Arg{"n_gpus", "1"}}); gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_exact"), Arg("num_feature", n_feat)}, p_dmat); - ASSERT_EQ(tparam.updater_seq, "grow_gpu,prune"); + ASSERT_EQ(tparam.updater, "grow_gpu,prune"); ASSERT_EQ(tparam.predictor, "gpu_predictor"); gbtree.ConfigureWithKnownData({Arg("tree_method", "gpu_hist"), Arg("num_feature", n_feat)}, p_dmat); - ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist"); + ASSERT_EQ(tparam.updater, "grow_gpu_hist"); ASSERT_EQ(tparam.predictor, "gpu_predictor"); gbtree.ConfigureWithKnownData({Arg{"booster", "dart"}, Arg{"tree_method", "gpu_hist"}, Arg{"num_feature", n_feat}}, p_dmat); - ASSERT_EQ(tparam.updater_seq, "grow_gpu_hist"); + ASSERT_EQ(tparam.updater, "grow_gpu_hist"); #endif delete p_shared_ptr_dmat; } + +// Some other parts of test are in `Tree.Json_IO'. +TEST(GBTree, Json_IO) { + size_t constexpr kRows = 16, kCols = 16; + auto gbm = ConstructGBM("gbtree", + {{"num_feature", std::to_string(kCols)}}, + kRows, kCols); + + Json model {Object()}; + gbm->Save(&model); + + std::stringstream ss; + Json::Dump(model, &ss); + + auto model_str = ss.str(); + model = Json::Load({model_str.c_str(), model_str.size()}); + ASSERT_EQ(get(model["name"]), "gbtree"); + auto j_param = model["gbtree_train_param"]; + ASSERT_EQ(get(j_param["num_parallel_tree"]), "1"); +} + +TEST(Dart, Json_IO) { + size_t constexpr kRows = 16, kCols = 16; + auto gbm = ConstructGBM("dart", + {{"num_feature", std::to_string(kCols)}}, + kRows, kCols); + + Json model {Object()}; + gbm->Save(&model); + + std::stringstream ss; + Json::Dump(model, &ss); + + auto model_str = ss.str(); + model = Json::Load({model_str.c_str(), model_str.size()}); + ASSERT_EQ(get(model["name"]), "dart"); + + { + auto const& gbtree = model["gbtree"]; + ASSERT_TRUE(IsA(gbtree)); + } + + ASSERT_EQ(get(model["dart_train_param"]["sample_type"]), "uniform"); + + auto j_weight_drop = get(model["weight_drop"]); + ASSERT_EQ(j_weight_drop.size(), 1); // One tree is trained. 
+} + } // namespace xgboost diff --git a/tests/cpp/objective/test_ranking_obj.cc b/tests/cpp/objective/test_ranking_obj.cc index f9410b9ee342..e6a5c1076a52 100644 --- a/tests/cpp/objective/test_ranking_obj.cc +++ b/tests/cpp/objective/test_ranking_obj.cc @@ -2,6 +2,9 @@ #include #include #include "../helpers.h" +#include + +namespace xgboost { TEST(Objective, PairwiseRankingGPair) { xgboost::GenericParameter tparam; @@ -32,3 +35,25 @@ } delete obj; } + +TEST(Objective, NDCG_Json_IO) { + xgboost::GenericParameter tparam; + tparam.InitAllowUnknown(Args{}); + + xgboost::ObjFunction * obj = + xgboost::ObjFunction::Create("rank:ndcg", &tparam); + obj->Configure(Args{}); + Json j_obj {Object()}; + obj->Save(&j_obj); + + ASSERT_EQ(get(j_obj["name"]), "rank:ndcg"); + + auto const& j_param = j_obj["lambda_rank_param"]; + + ASSERT_EQ(get(j_param["num_pairsample"]), "1"); + ASSERT_EQ(get(j_param["fix_list_weight"]), "0"); + + delete obj; +} + +} // namespace xgboost diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 46d7bc738247..a3b6ee42d6eb 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -4,6 +4,7 @@ #include "helpers.h" #include "xgboost/learner.h" +#include "xgboost/json.h" #include "dmlc/filesystem.h" namespace xgboost { @@ -151,6 +152,77 @@ TEST(Learner, IO) { delete pp_dmat; } +TEST(Learner, Json_IO) { + size_t constexpr kRows = 10; + int32_t constexpr kIters = 4; + std::vector labels(kRows); + for (size_t i = 0; i < labels.size(); ++i) { + labels[i] = i; + } + auto pp_dmat = CreateDMatrix(kRows, 10, 0); + auto& p_dmat = *pp_dmat; + p_dmat->Info().labels_.HostVector() = labels; + + Json dumped {Object()}; + + // gblinear + { + std::unique_ptr learner {Learner::Create({p_dmat})}; + learner->SetParams(Args{{"booster", "gblinear"}, + {"updater", "gpu_coord_descent"}}); + learner->Configure(); + for (int32_t iter = 0; iter < kIters; ++iter) { + learner->UpdateOneIter(iter, p_dmat.get()); + } + learner->Save(&dumped); + } + { + std::unique_ptr learner {Learner::Create({p_dmat})}; + learner->Load(dumped); + Json new_model {Object()}; + learner->Save(&new_model); + + ASSERT_EQ(dumped, new_model); + } + { + ASSERT_TRUE(IsA(dumped["version"])); + auto j_learner = dumped["Learner"]; + ASSERT_TRUE(IsA(j_learner["learner_model_param"])); + ASSERT_TRUE(IsA(j_learner["learner_train_param"])); + ASSERT_TRUE(IsA(j_learner["gradient_booster"])); + ASSERT_TRUE(IsA(j_learner["objective"])); + } + + // gbtree + { + std::unique_ptr learner {Learner::Create({p_dmat})}; + learner->SetParams(Args{{"booster", "gbtree"}, + {"tree_method", "hist"}}); + learner->Configure(); + for (int32_t iter = 0; iter < kIters; ++iter) { + learner->UpdateOneIter(iter, p_dmat.get()); + } + learner->Save(&dumped); + } + { + std::unique_ptr learner {Learner::Create({p_dmat})}; + learner->Load(dumped); + Json new_model {Object()}; + learner->Save(&new_model); + + ASSERT_EQ(dumped, new_model); + } + { + auto j_learner = dumped["Learner"]; + auto const& j_model = j_learner["gradient_booster"]["model"]; + ASSERT_TRUE(IsA(j_model)); + auto j_trees = get(j_model["trees"]); + ASSERT_EQ(j_trees.size(), kIters); + } + + delete pp_dmat; +} + // Tests for automatic GPU configuration.
TEST(Learner, GPUConfiguration) { using Arg = std::pair; @@ -218,5 +290,4 @@ } #endif // XGBOOST_USE_CUDA - } // namespace xgboost diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc index bb7d6996602c..967ac193e08d 100644 --- a/tests/cpp/tree/test_tree_model.cc +++ b/tests/cpp/tree/test_tree_model.cc @@ -3,6 +3,7 @@ #include #include "../helpers.h" #include "dmlc/filesystem.h" +#include "xgboost/json_io.h" namespace xgboost { // Manually construct tree in binary format @@ -218,4 +219,57 @@ TEST(Tree, DumpDot) { str = tree.DumpModel(fmap, true, R"(dot:{"graph_attrs": {"bgcolor": "#FFFF00"}})"); ASSERT_NE(str.find(R"(graph [ bgcolor="#FFFF00" ])"), std::string::npos); } + +TEST(Tree, Json_IO) { + { + std::string json_str = R"json( +{ + "Learner": { + "gradient_booster": { + "model": { + "trees": [{ + "nodes": [[1,2,2147483647,5,0.316871],[-1,0,4,0,-0.075000],[-1,0,4,0,0.743182]], + "stats": [ [42.949860,12.000000,5.076923,0.0],[0.000000,0.000000,0.000000,0.0] ]}] }, + "name": "gbtree"}}} +)json"; + + JsonReader reader({json_str.c_str(), json_str.size()}); + + auto loaded = Json::Load(&reader); + + auto const& nodes = loaded["Learner"]["gradient_booster"]["model"]["trees"][0]["nodes"]; + auto const& j_node = get(nodes)[0]; + ASSERT_EQ(get(j_node[0]), 1); + ASSERT_EQ(get(j_node[1]), 2); + ASSERT_EQ(get(j_node[2]), 2147483647); + ASSERT_EQ(get(j_node[3]), 5); + ASSERT_EQ(get(j_node[4]), 0.316871f); + + auto const& stats = loaded["Learner"]["gradient_booster"]["model"]["trees"][0]["stats"]; + auto const& j_stat = get(stats)[0]; + ASSERT_EQ(get(j_stat[0]), 42.949860f); + ASSERT_EQ(get(j_stat[1]), 12.0f); + ASSERT_EQ(get(j_stat[2]), 5.076923f); + ASSERT_EQ(get(j_stat[3]), 0.0f); + } + + { + RegTree tree; + tree.ExpandNode(0, 0, 0.0f, false, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f); + Json j_tree{Object()}; + tree.Save(&j_tree); + std::stringstream ss; + Json::Dump(j_tree, &ss); + + auto tparam = j_tree["tree_param"]; + ASSERT_EQ(get(tparam["num_feature"]), "0"); + ASSERT_EQ(get(tparam["num_nodes"]), "3"); + ASSERT_EQ(get(tparam["num_roots"]), "1"); + ASSERT_EQ(get(tparam["size_leaf_vector"]), "0"); + + RegTree loaded_tree; + loaded_tree.Load(j_tree); + } +} + } // namespace xgboost
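A closing usage sketch (not part of the patch): the `c_api.cc` changes above key the serialization format off the file extension, so saving to a `.json` path goes through the new `Learner::Save(Json*)` and loading it back goes through `Learner::Load(Json const&)`, while any other path keeps the existing binary format. `handle` is assumed to be an already trained `BoosterHandle`.

#include <xgboost/c_api.h>

// Round-trip a trained booster through the experimental JSON format.
int SaveAndReload(BoosterHandle handle) {
  // The ".json" suffix makes XGBoosterSaveModel set model_format=json.
  if (XGBoosterSaveModel(handle, "model.json") != 0) return -1;

  BoosterHandle loaded;
  if (XGBoosterCreate(nullptr, 0, &loaded) != 0) return -1;
  // The same extension check routes XGBoosterLoadModel through the JSON path.
  int ret = XGBoosterLoadModel(loaded, "model.json");
  XGBoosterFree(loaded);
  return ret;
}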