Revert "2bit gradient compression (apache#8662)" (apache#8711)
This reverts commit a499f89.
szha authored and eric-haibin-lin committed Dec 3, 2017
1 parent 82539df commit 116b059
Showing 21 changed files with 167 additions and 1,501 deletions.
44 changes: 18 additions & 26 deletions example/image-classification/common/fit.py
@@ -103,11 +103,6 @@ def add_fit_args(parser):
help='1 means test reading speed without training')
train.add_argument('--dtype', type=str, default='float32',
help='precision: float32 or float16')
train.add_argument('--gc-type', type=str, default='none',
help='type of gradient compression to use, \
takes `2bit` or `none` for now')
train.add_argument('--gc-threshold', type=float, default=0.5,
help='threshold for 2bit gradient compression')
return train

def fit(args, network, data_loader, **kwargs):
@@ -119,9 +114,6 @@ def fit(args, network, data_loader, **kwargs):
"""
# kvstore
kv = mx.kvstore.create(args.kv_store)
if args.gc_type != 'none':
kv.set_gradient_compression({'type': args.gc_type,
'threshold': args.gc_threshold})

# logging
head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
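Taken together, the two removed hunks above wired the new flags into the kvstore. A training run would have enabled compression roughly like this (a sketch; train_imagenet.py is one of the example drivers built on add_fit_args, and the flag values are illustrative):

    # illustrative invocation of the reverted flags
    python train_imagenet.py --kv-store dist_sync --gc-type 2bit --gc-threshold 0.5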
@@ -170,10 +162,10 @@ def fit(args, network, data_loader, **kwargs):

lr_scheduler = lr_scheduler
optimizer_params = {
'learning_rate': lr,
'wd' : args.wd,
'lr_scheduler': lr_scheduler,
'multi_precision': True}
'learning_rate': lr,
'wd' : args.wd,
'lr_scheduler': lr_scheduler,
'multi_precision': True}

# Only a limited number of optimizers have 'momentum' property
has_momentum = {'sgd', 'dcasgd', 'nag'}
@@ -203,17 +195,17 @@

# run
model.fit(train,
begin_epoch = args.load_epoch if args.load_epoch else 0,
num_epoch = args.num_epochs,
eval_data = val,
eval_metric = eval_metrics,
kvstore = kv,
optimizer = args.optimizer,
optimizer_params = optimizer_params,
initializer = initializer,
arg_params = arg_params,
aux_params = aux_params,
batch_end_callback = batch_end_callbacks,
epoch_end_callback = checkpoint,
allow_missing = True,
monitor = monitor)
begin_epoch = args.load_epoch if args.load_epoch else 0,
num_epoch = args.num_epochs,
eval_data = val,
eval_metric = eval_metrics,
kvstore = kv,
optimizer = args.optimizer,
optimizer_params = optimizer_params,
initializer = initializer,
arg_params = arg_params,
aux_params = aux_params,
batch_end_callback = batch_end_callbacks,
epoch_end_callback = checkpoint,
allow_missing = True,
monitor = monitor)
1 change: 1 addition & 0 deletions example/rnn/lstm_bucketing.py
@@ -48,6 +48,7 @@
parser.add_argument('--disp-batches', type=int, default=50,
help='show progress for every n batches')


def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0):
if not os.path.isfile(fname):
        raise IOError("Please use get_ptb_data.sh to download required file (data/ptb.train.txt)")
13 changes: 0 additions & 13 deletions include/mxnet/c_api.h
@@ -1550,19 +1550,6 @@ MXNET_DLL int MXInitPSEnv(mx_uint num_vars,
*/
MXNET_DLL int MXKVStoreCreate(const char *type,
KVStoreHandle *out);

/*!
* \brief Set parameters to use low-bit compressed gradients
* \param handle handle to the kvstore
* \param keys keys for compression parameters
* \param vals values for compression parameters
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXKVStoreSetGradientCompression(KVStoreHandle handle,
mx_uint num_params,
const char** keys,
const char** vals);
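
For reference, the Python binding removed further down in this commit invoked this entry point roughly as follows (a sketch assuming an existing KVStore `kv`; compare the set_gradient_compression method in python/mxnet/kvstore.py below):

    import ctypes
    from mxnet.base import _LIB, check_call, c_str, mx_uint

    params = {'type': '2bit', 'threshold': 0.5}
    # Convert the dict into two parallel C-string arrays, as _ctype_dict did.
    ckeys = (ctypes.c_char_p * len(params))(*[c_str(k) for k in params])
    cvals = (ctypes.c_char_p * len(params))(*[c_str(str(v)) for v in params.values()])
    check_call(_LIB.MXKVStoreSetGradientCompression(
        kv.handle, mx_uint(len(params)), ckeys, cvals))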

/*!
* \brief Delete a KVStore handle.
* \param handle handle to the kvstore
15 changes: 0 additions & 15 deletions include/mxnet/kvstore.h
@@ -31,7 +31,6 @@
#include <string>
#include <functional>
#include <atomic>
#include "../../src/kvstore/gradient_compression.h"
#include "./ndarray.h"
#if MXNET_USE_DIST_KVSTORE
#include "ps/ps.h"
@@ -66,14 +65,6 @@ class KVStore {
*/
inline const std::string& type() { return type_; }

/**
* \brief Set parameters to use low-bit compressed gradients
* \param compression_type type of compression
* \param threshold threshold for 2bit compression
*/
virtual void SetGradientCompression(const std::vector<std::pair<std::string, std::string> >
& kwargs) = 0;

/*!
* \brief Initialize a list of key-value pair to the store.
*
@@ -397,12 +388,6 @@ class KVStore {
*/
std::string type_;

/** \brief Gradient compression object starts with GC_NONE mode
* Used if SetGradientCompression sets the type.
* Currently there is no support for un-setting gradient compression
*/
std::shared_ptr<kvstore::GradientCompression> gradient_compression_;

/**
* \brief whether to do barrier when finalize
*/
12 changes: 2 additions & 10 deletions python/mxnet/gluon/trainer.py
@@ -44,20 +44,14 @@ class Trainer(object):
kvstore : str or KVStore
kvstore type for multi-gpu and distributed training. See help on
:any:`mxnet.kvstore.create` for more information.
compression_params : dict
Specifies type of gradient compression and additional arguments depending
on the type of compression being used. For example, 2bit compression requires a threshold.
Arguments would then be {'type':'2bit', 'threshold':0.5}
See mxnet.KVStore.set_gradient_compression method for more details on gradient compression.
Properties
----------
learning_rate: float
The current learning rate of the optimizer. Given an Optimizer object
optimizer, its learning rate can be accessed as optimizer.learning_rate.
"""
def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
compression_params=None):
def __init__(self, params, optimizer, optimizer_params=None, kvstore='device'):
if isinstance(params, (dict, ParameterDict)):
params = list(params.values())
if not isinstance(params, (list, tuple)):
@@ -71,7 +65,7 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
"First argument must be a list or dict of Parameters, " \
"got list of %s."%(type(param)))
self._params.append(param)
self._compression_params = compression_params

optimizer_params = optimizer_params if optimizer_params else {}
self._scale = optimizer_params.get('rescale_grad', 1.0)
self._contexts = self._check_contexts()
@@ -110,8 +104,6 @@ def _init_kvstore(self):
kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts),
arg_arrays)
if kvstore:
if self._compression_params:
kvstore.set_gradient_compression(self._compression_params)
if 'dist' in kvstore.type:
update_on_kvstore = False
for i, param in enumerate(self._params):
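Before the revert, the docstring above implied usage along these lines (a minimal sketch, assuming a `net` whose parameters were built elsewhere):

    from mxnet import gluon

    # 'compression_params' is the keyword removed by this commit
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.1},
                            kvstore='dist_sync',
                            compression_params={'type': '2bit', 'threshold': 0.5})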
62 changes: 0 additions & 62 deletions python/mxnet/kvstore.py
@@ -64,16 +64,6 @@ def _ctype_key_value(keys, vals):
else c_array_buf(ctypes.c_int, array('i', [keys] * len(vals)))
return (c_keys, c_handle_array(vals), use_str_keys)

def _ctype_dict(param_dict):
"""
Returns ctype arrays for keys and values(converted to strings) in a dictionary
"""
assert(isinstance(param_dict, dict)), \
"unexpected type for param_dict: " + str(type(param_dict))
c_keys = c_array(ctypes.c_char_p, [c_str(k) for k in param_dict.keys()])
c_vals = c_array(ctypes.c_char_p, [c_str(str(v)) for v in param_dict.values()])
return (c_keys, c_vals)
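
For illustration, the removed helper turned a compression dictionary into parallel ctypes string arrays, roughly (hypothetical values):

    ckeys, cvals = _ctype_dict({'type': '2bit', 'threshold': 0.5})
    # ckeys -> (c_char_p * 2)(b'type', b'threshold')
    # cvals -> (c_char_p * 2)(b'2bit', b'0.5')   # values stringified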

def _updater_wrapper(updater):
"""A wrapper for the user-defined handle."""
def updater_handle(key, lhs_handle, rhs_handle, _):
@@ -360,58 +350,6 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None):
check_call(_LIB.MXKVStorePullRowSparse(
self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority)))

def set_gradient_compression(self, compression_params):
""" Specifies type of low-bit quantization for gradient compression \
and additional arguments depending on the type of compression being used.
2bit Gradient Compression takes a positive float `threshold`.
The technique works by thresholding values such that positive values in the
gradient above threshold will be set to threshold. Negative values whose absolute
values are higher than threshold, will be set to the negative of threshold.
Values whose absolute values are less than threshold will be set to 0.
        By doing so, each value in the gradient is in one of three states. Two bits are
used to represent these states, and every 16 float values in the original
gradient can be represented using one float. This compressed representation
can reduce communication costs. The difference between these thresholded values and
original values is stored at the sender's end as residual and added to the
gradient in the next iteration.
When kvstore is 'local', gradient compression is used to reduce communication
between multiple devices (gpus). Gradient is quantized on each GPU which
computed the gradients, then sent to the GPU which merges the gradients. This
receiving GPU dequantizes the gradients and merges them. Note that this
increases memory usage on each GPU because of the residual array stored.
When kvstore is 'dist', gradient compression is used to reduce communication
        from worker to server. Gradient is quantized on each worker which
computed the gradients, then sent to the server which dequantizes
this data and merges the gradients from each worker. Note that this
increases CPU memory usage on each worker because of the residual array stored.
Only worker to server communication is compressed in this setting.
If each machine has multiple GPUs, currently this GPU to GPU or GPU to CPU communication
is not compressed. Server to worker communication (in the case of pull)
is also not compressed.
To use 2bit compression, we need to specify `type` as `2bit`.
Only specifying `type` would use default value for the threshold.
To completely specify the arguments for 2bit compression, we would need to pass
a dictionary which includes `threshold` like:
{'type': '2bit', 'threshold': 0.5}
Parameters
----------
compression_params : dict
A dictionary specifying the type and parameters for gradient compression.
The key `type` in this dictionary is a
required string argument and specifies the type of gradient compression.
Currently `type` can be only `2bit`
Other keys in this dictionary are optional and specific to the type
of gradient compression.
"""
ckeys, cvals = _ctype_dict(compression_params)
check_call(_LIB.MXKVStoreSetGradientCompression(self.handle,
mx_uint(len(compression_params)),
ckeys, cvals))
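
The quantization rule in the removed docstring can be sketched in NumPy as follows. This is illustrative only, not MXNet's kernel: the real implementation packs sixteen 2-bit codes into each 32-bit float and runs on CPU or GPU.

    import numpy as np

    def two_bit_quantize(grad, residual, threshold=0.5):
        """Quantize to the three states {-threshold, 0, +threshold},
        keeping the quantization error as a residual for the next step."""
        g = grad + residual                 # fold in last step's error
        q = np.zeros_like(g)
        q[g >= threshold] = threshold       # large positive -> +threshold
        q[g <= -threshold] = -threshold     # large negative -> -threshold
        residual[:] = g - q                 # remember what was dropped
        return q

    grad = np.array([0.9, -0.2, -1.4, 0.1], dtype=np.float32)
    residual = np.zeros_like(grad)
    print(two_bit_quantize(grad, residual))  # [ 0.5  0.  -0.5  0. ]
    print(residual)                          # [ 0.4 -0.2 -0.9  0.1]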

def set_optimizer(self, optimizer):
""" Registers an optimizer with the kvstore.
17 changes: 3 additions & 14 deletions python/mxnet/module/bucketing_module.py
@@ -54,16 +54,10 @@ class BucketingModule(BaseModule):
Instead they are initialized to 0 and can be set by set_states()
group2ctxs : list of dict of str to context
Default is `None`. Mapping the `ctx_group` attribute to the context assignment.
compression_params : dict
Specifies type of gradient compression and additional arguments depending
on the type of compression being used. For example, 2bit compression requires a threshold.
Arguments would then be {'type':'2bit', 'threshold':0.5}
See mxnet.KVStore.set_gradient_compression method for more details on gradient compression.
"""
def __init__(self, sym_gen, default_bucket_key=None, logger=logging,
context=ctx.cpu(), work_load_list=None,
fixed_param_names=None, state_names=None, group2ctxs=None,
compression_params=None):
fixed_param_names=None, state_names=None, group2ctxs=None):
super(BucketingModule, self).__init__(logger=logger)

assert default_bucket_key is not None
@@ -81,7 +75,6 @@ def __init__(self, sym_gen, default_bucket_key=None, logger=logging,
_check_input_names(symbol, state_names, "state", True)
_check_input_names(symbol, fixed_param_names, "fixed_param", True)

self._compression_params = compression_params
self._fixed_param_names = fixed_param_names
self._state_names = state_names
self._context = context
@@ -330,9 +323,7 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
module = Module(symbol, data_names, label_names, logger=self.logger,
context=self._context, work_load_list=self._work_load_list,
fixed_param_names=self._fixed_param_names,
state_names=self._state_names,
group2ctxs=self._group2ctxs,
compression_params=self._compression_params)
state_names=self._state_names, group2ctxs=self._group2ctxs)
module.bind(data_shapes, label_shapes, for_training, inputs_need_grad,
force_rebind=False, shared_module=None, grad_req=grad_req)
self._curr_module = module
@@ -362,9 +353,7 @@ def switch_bucket(self, bucket_key, data_shapes, label_shapes=None):
logger=self.logger, context=self._context,
work_load_list=self._work_load_list,
fixed_param_names=self._fixed_param_names,
state_names=self._state_names,
group2ctxs=self._group2ctxs,
compression_params=self._compression_params)
state_names=self._state_names, group2ctxs=self._group2ctxs)
module.bind(data_shapes, label_shapes, self._curr_module.for_training,
self._curr_module.inputs_need_grad,
force_rebind=False, shared_module=self._buckets[self._default_bucket_key])
11 changes: 1 addition & 10 deletions python/mxnet/module/module.py
@@ -61,16 +61,10 @@ class Module(BaseModule):
Instead they are initialized to 0 and can be set by `set_states()`.
group2ctxs : list of dict of str to context
Default is `None`. Mapping the `ctx_group` attribute to the context assignment.
compression_params : dict
Specifies type of gradient compression and additional arguments depending
on the type of compression being used. For example, 2bit compression requires a threshold.
Arguments would then be {'type':'2bit', 'threshold':0.5}
See mxnet.KVStore.set_gradient_compression method for more details on gradient compression.
"""
def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
logger=logging, context=ctx.cpu(), work_load_list=None,
fixed_param_names=None, state_names=None, group2ctxs=None,
compression_params=None):
fixed_param_names=None, state_names=None, group2ctxs=None):
super(Module, self).__init__(logger=logger)

if isinstance(context, ctx.Context):
@@ -109,7 +103,6 @@ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
self._aux_params = None
self._params_dirty = False

self._compression_params = compression_params
self._optimizer = None
self._kvstore = None
self._update_on_kvstore = None
@@ -532,8 +525,6 @@ def init_optimizer(self, kvstore='local', optimizer='sgd',
self._updater = None

if kvstore:
if self._compression_params:
kvstore.set_gradient_compression(self._compression_params)
# copy initialized local parameters to kvstore
_initialize_kvstore(kvstore=kvstore,
param_arrays=self._exec_group.param_arrays,
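As the removed init_optimizer hook above shows, compression was applied when the optimizer was bound to a kvstore. A minimal sketch, assuming a symbol `sym` and a data iterator `train_iter` defined elsewhere:

    import mxnet as mx

    mod = mx.mod.Module(symbol=sym, context=mx.gpu(0),
                        compression_params={'type': '2bit', 'threshold': 0.5})
    mod.bind(data_shapes=train_iter.provide_data,
             label_shapes=train_iter.provide_label)
    mod.init_params()
    mod.init_optimizer(kvstore='dist_sync')  # set_gradient_compression ran here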
14 changes: 0 additions & 14 deletions src/c_api/c_api.cc
@@ -748,20 +748,6 @@ int MXKVStoreCreate(const char *type,
API_END();
}

int MXKVStoreSetGradientCompression(KVStoreHandle handle, mx_uint num_params,
const char** keys, const char** vals) {
API_BEGIN();
std::vector<std::pair<std::string, std::string> > params;
for (mx_uint i = 0; i < num_params; ++i) {
std::pair<std::string, std::string> p;
p.first = keys[i];
p.second = vals[i];
params.push_back(p);
}
static_cast<KVStore*>(handle)->SetGradientCompression(params);
API_END();
}

int MXKVStoreFree(KVStoreHandle handle) {
API_BEGIN();
delete static_cast<KVStore*>(handle);
(Diff for the remaining 12 changed files not shown.)
