This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

1bit gradient compression implementation
shuo-ouyang committed Aug 9, 2020
1 parent d4052fd commit db71b9c
Showing 8 changed files with 428 additions and 70 deletions.
6 changes: 4 additions & 2 deletions ci/docker/runtime_functions.sh
@@ -1315,8 +1315,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=gluon_type_cpu
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit --no-multiprecision
python3 ../../tools/launch.py -n 3 --launcher local python3 test_server_profiling.py
popd
}
5 changes: 4 additions & 1 deletion python/mxnet/kvstore/kvstore.py
@@ -498,6 +498,9 @@ def set_gradient_compression(self, compression_params):
""" Specifies type of low-bit quantization for gradient compression \
and additional arguments depending on the type of compression being used.
The 1bit compression works as follows: values in the gradient that are above the
threshold will be set to +1, whereas values at or below the threshold will be set to -1.
2bit Gradient Compression takes a positive float `threshold`.
The technique works by thresholding values such that positive values in the
gradient above threshold will be set to threshold. Negative values whose absolute
@@ -538,7 +541,7 @@ def set_gradient_compression(self, compression_params):
A dictionary specifying the type and parameters for gradient compression.
The key `type` in this dictionary is a
required string argument and specifies the type of gradient compression.
Currently `type` can be only `2bit`
Currently `type` can be only `1bit` and `2bit`
Other keys in this dictionary are optional and specific to the type
of gradient compression.
"""
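A minimal usage sketch (not part of this commit) of the API documented above; the kvstore
type and the threshold value are only illustrative:

import mxnet as mx

kv = mx.kv.create('dist_sync')
# '1bit' quantizes each gradient value to a single bit: values whose
# accumulated residual exceeds `threshold` dequantize to +1, all others to -1.
kv.set_gradient_compression({'type': '1bit', 'threshold': 0.5})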
107 changes: 107 additions & 0 deletions src/kvstore/gradient_compression-inl.h
@@ -32,11 +32,106 @@ namespace mxnet {
namespace kvstore {

// these gpu functions are defined in gradient_compression.cu
void Quantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
void Dequantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
void Quantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
void Dequantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);

struct quantize_1bit {
MSHADOW_XINLINE static void Map(int out_block_id,
int original_size,
float *out,
float *grad,
float *residual,
const float threshold) {
// this block contains the compressed representation of
// up to 32 values starting from out_block_id*32
float *compr_block = out + out_block_id;
// init to 0
*compr_block = 0;
// start and end are indices in original grad array
const int start = out_block_id << 5;
const int end = (start + 32 <= original_size) ? start + 32 : original_size;

char *block_ptr = reinterpret_cast<char *>(compr_block);
// masks used to quantize data
const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
for (int i = start; i < end; ++i) {
// adds offset to reach appropriate byte
char *curr_byte = block_ptr + ((i - start) >> 3);
// adds gradient to existing residual to get updated grad
residual[i] += grad[i];
if (residual[i] > threshold) {
// set data to 1
*curr_byte |= bits[(i & 7)];
// subtract 1 from the residual because this position will be dequantized to +1
residual[i] -= 1;
} else {
// set data to 0
*curr_byte &= ~bits[(i & 7)];
// add 1 to the residual
// because this position will be dequantized to -1
residual[i] += 1;
}
}
}
};

template<typename xpu>
void Quantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
mxnet::op::mxnet_op::Kernel<quantize_1bit, xpu>
::Launch(s,
inputs[2].Size(), // compressed array size
inputs[0].Size(), // original size
inputs[2].dptr<float>(), // compressed array
inputs[0].dptr<float>(), // original array
inputs[1].dptr<float>(), // residual array
threshold); // threshold
}
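The same quantization step as an illustrative NumPy sketch (not part of this commit; the
function name is hypothetical). Each 32-bit float of the compressed output packs 32 gradient
values, which is also why GetCompressionFactor() below reports a factor of 32 for 1-bit
compression; the quantization error is carried to the next call through the residual:

import numpy as np

def quantize_1bit_reference(grad, residual, threshold):
    # Updates `residual` in place with the quantization error and
    # returns the gradient packed into one bit per value.
    bits = np.zeros(grad.size, dtype=np.uint8)
    for i in range(grad.size):
        residual[i] += grad[i]
        if residual[i] > threshold:
            bits[i] = 1            # dequantizes to +1
            residual[i] -= 1
        else:
            bits[i] = 0            # dequantizes to -1
            residual[i] += 1
    return np.packbits(bits)       # 32 values per 32 bits of output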

struct dequantize_1bit {
MSHADOW_XINLINE static void Map(int i,
float *out,
float *in,
const float threshold) {
// get position of dequantized value to fill
float *outval = out + i;
// gets byte which holds quantized value for this position
char *ch_ptr = reinterpret_cast<char *>(in + (i >> 5));
ch_ptr += ((i & 31) >> 3);
// masks used to quantize data
const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
// col denotes which bit of a byte is set for this value
// col=0 implies the first bit, col=1 implies the second bit,...
const int col = i & 7;
const uint8_t mask = bits[col];
const uint8_t masked = *ch_ptr & mask;
if (masked == mask) {
*outval = +1;
} else {
// if the bit at this position is 0,
// dequantize it to -1
*outval = -1;
}
}
};

template<typename xpu>
void Dequantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
mxnet::op::mxnet_op::Kernel<dequantize_1bit, xpu>
::Launch(s,
inputs[1].Size(), // original size
inputs[1].dptr<float>(), // out array
inputs[0].dptr<float>(), // compressed array
threshold); // threshold
}
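A matching dequantization sketch (continuing the NumPy example above): a set bit becomes +1
and a cleared bit becomes -1, so the threshold is not used on this side even though the kernel
above accepts it:

def dequantize_1bit_reference(packed, original_size):
    # Expand each stored bit back to a float: 1 -> +1.0, 0 -> -1.0.
    bits = np.unpackbits(packed)[:original_size]
    return np.where(bits == 1, 1.0, -1.0).astype(np.float32)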

struct quantize_2bit {
MSHADOW_XINLINE static void Map(int out_block_id,
int original_size,
@@ -138,6 +233,18 @@ void Dequantize2BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet
threshold); // positive threshold
}

inline void Quantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
Quantize1BitKernelLaunch(s, inputs, threshold);
}

inline void Dequantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
Dequantize1BitKernelLaunch(s, inputs, threshold);
}

inline void Quantize2BitImpl(mshadow::Stream<mshadow::cpu> *s,
const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
84 changes: 64 additions & 20 deletions src/kvstore/gradient_compression.cc
@@ -41,8 +41,10 @@ void GradientCompression::SetParams(const std::vector<std::pair<std::string, std
& kwargs) {
GradientCompressionParam params;
params.InitAllowUnknown(kwargs);
CHECK_GT(params.threshold, 0) << "threshold must be greater than 0";
if (params.type == "2bit") {
if (params.type == "1bit") {
SetOneBitCompression(params.threshold);
} else if (params.type == "2bit") {
CHECK_GT(params.threshold, 0) << "threshold must be greater than 0 for two bit compression";
SetTwoBitCompression(params.threshold);
} else {
LOG(FATAL) << "Unknown type for gradient compression " << params.type;
@@ -57,6 +59,11 @@ std::string GradientCompression::get_type_str() {
return std::to_string(static_cast<int>(type_));
}

void GradientCompression::SetOneBitCompression(const float threshold) {
type_ = CompressionType::kOneBit;
threshold_ = threshold;
}

void GradientCompression::SetTwoBitCompression(const float threshold) {
type_ = CompressionType::kTwoBit;
threshold_ = threshold;
@@ -83,7 +90,9 @@ void GradientCompression::DecodeParams(const std::string &s) {
}

int GradientCompression::GetCompressionFactor() {
if (type_ == CompressionType::kTwoBit) {
if (type_ == CompressionType::kOneBit) {
return 32;
} else if (type_ == CompressionType::kTwoBit) {
return 16;
} else {
LOG(FATAL) << "Unsupported compression type: " << get_type_str();
@@ -106,16 +115,34 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
const int a = from.ctx().dev_mask();
const int b = to->ctx().dev_mask();
const float threshold = threshold_;
if (type_ == CompressionType::kTwoBit) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
} else {
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
} else {
#if MXNET_USE_CUDA
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait for the GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
@@ -124,14 +151,14 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
} else {
LOG(FATAL) << "unknown device mask";
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
} else {
LOG(FATAL) << "unknown device mask";
}
#else
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
#endif
}
} else {
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
}

@@ -142,35 +169,52 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray
const int a = from.ctx().dev_mask();
const int b = to->ctx().dev_mask();
const float threshold = threshold_;
if (type_ == CompressionType::kTwoBit) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
} else {
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
} else {
#if MXNET_USE_CUDA
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
Dequantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait for the GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait for the GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
} else {
LOG(FATAL) << "unknown device mask";
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
} else {
LOG(FATAL) << "unknown device mask";
}
#else
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
#endif
}
} else {
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
}

} // namespace kvstore
} // namespace mxnet

10 changes: 10 additions & 0 deletions src/kvstore/gradient_compression.cu
@@ -27,6 +27,16 @@

namespace mxnet {
namespace kvstore {
void Quantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
const float threshold) {
Quantize1BitKernelLaunch(s, inputs, threshold);
}

void Dequantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
const float threshold) {
Dequantize1BitKernelLaunch(s, inputs, threshold);
}

void Quantize2BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
const float threshold) {
Quantize2BitKernelLaunch(s, inputs, threshold);
8 changes: 7 additions & 1 deletion src/kvstore/gradient_compression.h
@@ -35,7 +35,7 @@ namespace mxnet {
namespace kvstore {

enum class CompressionType {
kNone, kTwoBit
kNone, kOneBit, kTwoBit
};

struct GradientCompressionParam : public dmlc::Parameter<GradientCompressionParam> {
@@ -72,6 +72,12 @@ class GradientCompression {
*/
std::string get_type_str();

/*!
* \brief sets one bit gradient compression
* \param threshold float value used for thresholding gradients
*/
void SetOneBitCompression(const float threshold);

/*!
* \brief sets two bit gradient compression
* \param threshold float value used for thresholding gradients

