-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimizations of pre-processing for 'hist' tree method #4310
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ | |
#include "./column_matrix.h" | ||
#include "./hist_util.h" | ||
#include "./quantile.h" | ||
#include "./../tree/updater_quantile_hist.h" | ||
|
||
#if defined(XGBOOST_MM_PREFETCH_PRESENT) | ||
#include <xmmintrin.h> | ||
|
@@ -49,7 +50,7 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) { | |
constexpr int kFactor = 8; | ||
std::vector<WXQSketch> sketchs; | ||
|
||
const int nthread = omp_get_max_threads(); | ||
const size_t nthread = omp_get_max_threads(); | ||
|
||
unsigned const nstep = | ||
static_cast<unsigned>((info.num_col_ + nthread - 1) / nthread); | ||
|
@@ -67,34 +68,85 @@ void HistCutMatrix::Init(DMatrix* p_fmat, uint32_t max_num_bins) { | |
// Use group index for weights? | ||
bool const use_group_ind = num_groups != 0 && weights.size() != info.num_row_; | ||
|
||
for (const auto &batch : p_fmat->GetRowBatches()) { | ||
size_t group_ind = 0; | ||
if (use_group_ind) { | ||
group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid); | ||
if (use_group_ind) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @hcho3 We need to change the name of these |
||
for (const auto &batch : p_fmat->GetRowBatches()) { | ||
size_t group_ind = this->SearchGroupIndFromBaseRow(group_ptr, batch.base_rowid); | ||
#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind) | ||
{ | ||
CHECK_EQ(nthread, omp_get_num_threads()); | ||
auto tid = static_cast<unsigned>(omp_get_thread_num()); | ||
unsigned begin = std::min(nstep * tid, ncol); | ||
unsigned end = std::min(nstep * (tid + 1), ncol); | ||
|
||
// do not iterate if no columns are assigned to the thread | ||
if (begin < end && end <= ncol) { | ||
for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*) | ||
size_t const ridx = batch.base_rowid + i; | ||
SparsePage::Inst const inst = batch[i]; | ||
if (group_ptr[group_ind] == ridx && | ||
// maximum equals to weights.size() - 1 | ||
group_ind < num_groups - 1) { | ||
// move to next group | ||
group_ind++; | ||
} | ||
for (auto const& entry : inst) { | ||
if (entry.index >= begin && entry.index < end) { | ||
size_t w_idx = group_ind; | ||
sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx)); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
#pragma omp parallel num_threads(nthread) firstprivate(group_ind, use_group_ind) | ||
{ | ||
CHECK_EQ(nthread, omp_get_num_threads()); | ||
auto tid = static_cast<unsigned>(omp_get_thread_num()); | ||
unsigned begin = std::min(nstep * tid, ncol); | ||
unsigned end = std::min(nstep * (tid + 1), ncol); | ||
|
||
// do not iterate if no columns are assigned to the thread | ||
if (begin < end && end <= ncol) { | ||
for (size_t i = 0; i < batch.Size(); ++i) { // NOLINT(*) | ||
size_t const ridx = batch.base_rowid + i; | ||
SparsePage::Inst const inst = batch[i]; | ||
if (use_group_ind && | ||
group_ptr[group_ind] == ridx && | ||
// maximum equals to weights.size() - 1 | ||
group_ind < num_groups - 1) { | ||
// move to next group | ||
group_ind++; | ||
} else { | ||
for (const auto &batch : p_fmat->GetRowBatches()) { | ||
const size_t size = batch.Size(); | ||
const size_t block_size = 512; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's the rule for choosing this number (512)? Is this number hardware-dependent? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, but this value influences on size of allocated buffer: it should not be too large from the one point of view and be not too small (useful work of omp-task should be more than omp-overhead) from the second point of view. By experiments 512 have been chosen as optimal. |
||
const size_t block_size_iter = block_size * nthread; | ||
const size_t n_blocks = size / block_size_iter + !!(size % block_size_iter); | ||
|
||
std::vector<std::vector<std::pair<float, float>>> buff(nthread); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe add a note like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Buffs are not needed for better cache locality. |
||
for (size_t tid = 0; tid < nthread; ++tid) { | ||
buff[tid].resize(block_size * ncol); | ||
} | ||
|
||
std::vector<size_t> sizes(nthread * ncol, 0); | ||
|
||
for (size_t iblock = 0; iblock < n_blocks; ++iblock) { | ||
#pragma omp parallel num_threads(nthread) | ||
{ | ||
int tid = omp_get_thread_num(); | ||
|
||
const size_t ibegin = iblock * block_size_iter + tid * block_size; | ||
const size_t iend = std::min(ibegin + block_size, size); | ||
|
||
auto* p_sizes = sizes.data() + ncol * tid; | ||
auto* p_buff = buff[tid].data(); | ||
|
||
for (size_t i = ibegin; i < iend; ++i) { | ||
size_t const ridx = batch.base_rowid + i; | ||
bst_float w = info.GetWeight(ridx); | ||
SparsePage::Inst const inst = batch[i]; | ||
|
||
for (auto const& entry : inst) { | ||
const size_t idx = entry.index; | ||
p_buff[idx * block_size + p_sizes[idx]] = { entry.fvalue, w }; | ||
p_sizes[idx]++; | ||
} | ||
} | ||
for (auto const& entry : inst) { | ||
if (entry.index >= begin && entry.index < end) { | ||
size_t w_idx = use_group_ind ? group_ind : ridx; | ||
sketchs[entry.index].Push(entry.fvalue, info.GetWeight(w_idx)); | ||
#pragma omp barrier | ||
#pragma omp for schedule(static) | ||
for (int32_t icol = 0; icol < static_cast<int32_t>(ncol); ++icol) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The number of columns is usually small, ~3000 is quite an extreme case, ~120 might be the more common case. Is it preferred to use OMP here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OMP must be here :) |
||
for (size_t tid = 0; tid < nthread; ++tid) { | ||
auto* p_sizes = sizes.data() + ncol * tid; | ||
auto* p_buff = buff[tid].data() + icol * block_size; | ||
|
||
for (size_t i = 0; i < p_sizes[icol]; ++i) { | ||
sketchs[icol].Push(p_buff[i].first, p_buff[i].second); | ||
} | ||
|
||
p_sizes[icol] = 0; | ||
} | ||
} | ||
} | ||
|
@@ -177,22 +229,66 @@ uint32_t HistCutMatrix::GetBinIdx(const Entry& e) { | |
|
||
void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) { | ||
cut.Init(p_fmat, max_num_bins); | ||
|
||
const int nthread = omp_get_max_threads(); | ||
const int32_t nthread = omp_get_max_threads(); | ||
// const int nthread = 1; | ||
const uint32_t nbins = cut.row_ptr.back(); | ||
hit_count.resize(nbins, 0); | ||
hit_count_tloc_.resize(nthread * nbins, 0); | ||
|
||
row_ptr.push_back(0); | ||
|
||
size_t new_size = 1; | ||
for (const auto &batch : p_fmat->GetRowBatches()) { | ||
const size_t rbegin = row_ptr.size() - 1; | ||
for (size_t i = 0; i < batch.Size(); ++i) { | ||
row_ptr.push_back(batch[i].size() + row_ptr.back()); | ||
new_size += batch.Size(); | ||
} | ||
|
||
row_ptr.resize(new_size); | ||
row_ptr[0] = 0; | ||
|
||
size_t rbegin = 0; | ||
size_t prev_sum = 0; | ||
|
||
for (const auto &batch : p_fmat->GetRowBatches()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note to @trivialfis, @CodingCat: we are not really sure if 'hist' method works when external memory is enabled (i.e. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @hcho3 Absolutely agree. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AFAICT, it doesn't work in distributed setting, even after @trivialfis 's fix in 0.82 |
||
MemStackAllocator<size_t, 128> partial_sums(nthread); | ||
size_t* p_part = partial_sums.Get(); | ||
|
||
size_t block_size = batch.Size() / nthread; | ||
|
||
#pragma omp parallel num_threads(nthread) | ||
{ | ||
#pragma omp for | ||
for (int32_t tid = 0; tid < nthread; ++tid) { | ||
size_t ibegin = block_size * tid; | ||
size_t iend = (tid == (nthread-1) ? batch.Size() : (block_size * (tid+1))); | ||
|
||
size_t sum = 0; | ||
for (size_t i = ibegin; i < iend; ++i) { | ||
sum += batch[i].size(); | ||
row_ptr[rbegin + 1 + i] = sum; | ||
} | ||
} | ||
|
||
#pragma omp single | ||
{ | ||
p_part[0] = prev_sum; | ||
for (int32_t i = 1; i < nthread; ++i) { | ||
p_part[i] = p_part[i - 1] + row_ptr[rbegin + i*block_size]; | ||
} | ||
} | ||
|
||
#pragma omp for | ||
for (int32_t tid = 0; tid < nthread; ++tid) { | ||
size_t ibegin = block_size * tid; | ||
size_t iend = (tid == (nthread-1) ? batch.Size() : (block_size * (tid+1))); | ||
|
||
for (size_t i = ibegin; i < iend; ++i) { | ||
row_ptr[rbegin + 1 + i] += p_part[tid]; | ||
} | ||
} | ||
} | ||
|
||
index.resize(row_ptr.back()); | ||
|
||
CHECK_GT(cut.cut.size(), 0U); | ||
CHECK_EQ(cut.row_ptr.back(), cut.cut.size()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @SmirnovEgorRu Why are we removing this line? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Now row_ptr is allocated to final length from the start. So if we have a lot of batches - back() for not final batch will return some trash. |
||
|
||
auto bsize = static_cast<omp_ulong>(batch.Size()); | ||
#pragma omp parallel for num_threads(nthread) schedule(static) | ||
|
@@ -203,7 +299,6 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) { | |
SparsePage::Inst inst = batch[i]; | ||
|
||
CHECK_EQ(ibegin + inst.size(), iend); | ||
|
||
for (bst_uint j = 0; j < inst.size(); ++j) { | ||
uint32_t idx = cut.GetBinIdx(inst[j]); | ||
|
||
|
@@ -215,10 +310,13 @@ void GHistIndexMatrix::Init(DMatrix* p_fmat, int max_num_bins) { | |
|
||
#pragma omp parallel for num_threads(nthread) schedule(static) | ||
for (bst_omp_uint idx = 0; idx < bst_omp_uint(nbins); ++idx) { | ||
for (int tid = 0; tid < nthread; ++tid) { | ||
for (size_t tid = 0; tid < nthread; ++tid) { | ||
hit_count[idx] += hit_count_tloc_[tid * nbins + idx]; | ||
} | ||
} | ||
|
||
prev_sum = row_ptr[rbegin + batch.Size()]; | ||
rbegin += batch.Size(); | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,74 @@ | |
namespace xgboost { | ||
namespace common { | ||
|
||
/* | ||
* \brief A thin wrapper around dynamically allocated C-style array. | ||
* Make sure to call resize() before use. | ||
*/ | ||
template<typename T> | ||
SmirnovEgorRu marked this conversation as resolved.
Show resolved
Hide resolved
|
||
struct SimpleArray { | ||
~SimpleArray() { | ||
free(ptr_); | ||
ptr_ = nullptr; | ||
} | ||
|
||
void resize(size_t n) { | ||
T* ptr = static_cast<T*>(malloc(n*sizeof(T))); | ||
memcpy(ptr, ptr_, n_ * sizeof(T)); | ||
free(ptr_); | ||
ptr_ = ptr; | ||
n_ = n; | ||
} | ||
|
||
T& operator[](size_t idx) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What is the advantage of using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@SmirnovEgorRu I'm wondering about this too, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I replaced std::vector by SimpleArray because std::vector has to slow resize() function, it fill memory by very not performance manner. |
||
return ptr_[idx]; | ||
} | ||
|
||
T& operator[](size_t idx) const { | ||
return ptr_[idx]; | ||
} | ||
|
||
size_t size() const { | ||
return n_; | ||
} | ||
|
||
T back() const { | ||
return ptr_[n_-1]; | ||
} | ||
|
||
T* data() { | ||
return ptr_; | ||
} | ||
|
||
const T* data() const { | ||
return ptr_; | ||
} | ||
|
||
|
||
T* begin() { | ||
return ptr_; | ||
} | ||
|
||
const T* begin() const { | ||
return ptr_; | ||
} | ||
|
||
T* end() { | ||
return ptr_ + n_; | ||
} | ||
|
||
const T* end() const { | ||
return ptr_ + n_; | ||
} | ||
|
||
private: | ||
T* ptr_ = nullptr; | ||
size_t n_ = 0; | ||
}; | ||
|
||
|
||
|
||
|
||
/*! \brief Cut configuration for all the features. */ | ||
struct HistCutMatrix { | ||
/*! \brief Unit pointer to rows by element position */ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -59,7 +59,6 @@ class RowSetCollection { | |
} | ||
// clear up things | ||
inline void Clear() { | ||
row_indices_.clear(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we still need this line. Clearing a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did some measurements with my all optimizations and InitData() function spends ~15% of tree building time. for (size_t i = 0; i < info.num_row_; ++i) {
if (gpair[i].GetHess() >= 0.0f) {
row_indices.push_back(i); There are 2 reasons: std::vector - not performance friendly object, push_back contains at least one "if" to check current size. Also, this code can't be parallelized and it is sequential. I have changed this code to use C-style partition: j = 0;
for (size_t i = 0; i < info.num_row_; ++i) {
if (gpair[i].GetHess() >= 0.0f) {
p_row_indices[j++] = i; But in this case I need to do row_indices_->resize(num_row) each time after row_indices_->clear(). But resize(num_rows) - very expensive operations if current size=0, it will fill data inside by default value (in our case - 0) by not performance manner. So, I deleted clear() and use resize(nrows) only one time. After my changes InitData() spends less then 1% now. P.S. std::vector - really not performance friendly :) |
||
elem_of_each_node_.clear(); | ||
} | ||
// initialize node id 0->everything | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@SmirnovEgorRu Why the change from
bst_uint
toint32_t
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Windows requires that integer type for count inside "#pragma omp paralell for" should have signed type.
Before nfeature wasn't used for parallel loop, but now it is used. So it was changed from unsigned to signed