Revert "2bit gradient compression (apache#8662)" (apache#8711)
This reverts commit a499f89.
szha authored and eric-haibin-lin committed Dec 3, 2017
1 parent 82539df commit 116b059
Showing 21 changed files with 167 additions and 1,501 deletions.
44 changes: 18 additions & 26 deletions example/image-classification/common/fit.py
@@ -103,11 +103,6 @@ def add_fit_args(parser):
help='1 means test reading speed without training')
train.add_argument('--dtype', type=str, default='float32',
help='precision: float32 or float16')
train.add_argument('--gc-type', type=str, default='none',
help='type of gradient compression to use, \
takes `2bit` or `none` for now')
train.add_argument('--gc-threshold', type=float, default=0.5,
help='threshold for 2bit gradient compression')
return train

def fit(args, network, data_loader, **kwargs):
@@ -119,9 +114,6 @@ def fit(args, network, data_loader, **kwargs):
"""
# kvstore
kv = mx.kvstore.create(args.kv_store)
if args.gc_type != 'none':
kv.set_gradient_compression({'type': args.gc_type,
'threshold': args.gc_threshold})

# logging
head = '%(asctime)-15s Node[' + str(kv.rank) + '] %(message)s'
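Taken together, the two removed hunks above wired the new flags into the kvstore. A training run would have enabled compression roughly like this (a sketch; train_imagenet.py is one of the example drivers built on add_fit_args, and the flag values are illustrative):

    # illustrative invocation of the reverted flags
    python train_imagenet.py --kv-store dist_sync --gc-type 2bit --gc-threshold 0.5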
@@ -170,10 +162,10 @@ def fit(args, network, data_loader, **kwargs):

lr_scheduler = lr_scheduler
optimizer_params = {
'learning_rate': lr,
'wd' : args.wd,
'lr_scheduler': lr_scheduler,
'multi_precision': True}
'learning_rate': lr,
'wd' : args.wd,
'lr_scheduler': lr_scheduler,
'multi_precision': True}

# Only a limited number of optimizers have 'momentum' property
has_momentum = {'sgd', 'dcasgd', 'nag'}
@@ -203,17 +195,17 @@

# run
model.fit(train,
begin_epoch = args.load_epoch if args.load_epoch else 0,
num_epoch = args.num_epochs,
eval_data = val,
eval_metric = eval_metrics,
kvstore = kv,
optimizer = args.optimizer,
optimizer_params = optimizer_params,
initializer = initializer,
arg_params = arg_params,
aux_params = aux_params,
batch_end_callback = batch_end_callbacks,
epoch_end_callback = checkpoint,
allow_missing = True,
monitor = monitor)
begin_epoch = args.load_epoch if args.load_epoch else 0,
num_epoch = args.num_epochs,
eval_data = val,
eval_metric = eval_metrics,
kvstore = kv,
optimizer = args.optimizer,
optimizer_params = optimizer_params,
initializer = initializer,
arg_params = arg_params,
aux_params = aux_params,
batch_end_callback = batch_end_callbacks,
epoch_end_callback = checkpoint,
allow_missing = True,
monitor = monitor)
1 change: 1 addition & 0 deletions example/rnn/lstm_bucketing.py
@@ -48,6 +48,7 @@
parser.add_argument('--disp-batches', type=int, default=50,
help='show progress for every n batches')


def tokenize_text(fname, vocab=None, invalid_label=-1, start_label=0):
if not os.path.isfile(fname):
        raise IOError("Please use get_ptb_data.sh to download required file (data/ptb.train.txt)")
13 changes: 0 additions & 13 deletions include/mxnet/c_api.h
@@ -1550,19 +1550,6 @@ MXNET_DLL int MXInitPSEnv(mx_uint num_vars,
*/
MXNET_DLL int MXKVStoreCreate(const char *type,
KVStoreHandle *out);

/*!
* \brief Set parameters to use low-bit compressed gradients
* \param handle handle to the kvstore
* \param keys keys for compression parameters
* \param vals values for compression parameters
* \return 0 when success, -1 when failure happens
*/
MXNET_DLL int MXKVStoreSetGradientCompression(KVStoreHandle handle,
mx_uint num_params,
const char** keys,
const char** vals);
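
For reference, the Python binding removed further down in this commit invoked this entry point roughly as follows (a sketch assuming an existing KVStore `kv`; compare the set_gradient_compression method in python/mxnet/kvstore.py below):

    import ctypes
    from mxnet.base import _LIB, check_call, c_str, mx_uint

    params = {'type': '2bit', 'threshold': 0.5}
    # Convert the dict into two parallel C-string arrays, as _ctype_dict did.
    ckeys = (ctypes.c_char_p * len(params))(*[c_str(k) for k in params])
    cvals = (ctypes.c_char_p * len(params))(*[c_str(str(v)) for v in params.values()])
    check_call(_LIB.MXKVStoreSetGradientCompression(
        kv.handle, mx_uint(len(params)), ckeys, cvals))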

/*!
* \brief Delete a KVStore handle.
* \param handle handle to the kvstore
15 changes: 0 additions & 15 deletions include/mxnet/kvstore.h
@@ -31,7 +31,6 @@
#include <string>
#include <functional>
#include <atomic>
#include "../../src/kvstore/gradient_compression.h"
#include "./ndarray.h"
#if MXNET_USE_DIST_KVSTORE
#include "ps/ps.h"
@@ -66,14 +65,6 @@ class KVStore {
*/
inline const std::string& type() { return type_; }

/**
* \brief Set parameters to use low-bit compressed gradients
* \param compression_type type of compression
* \param threshold threshold for 2bit compression
*/
virtual void SetGradientCompression(const std::vector<std::pair<std::string, std::string> >
& kwargs) = 0;

/*!
* \brief Initialize a list of key-value pair to the store.
*
@@ -397,12 +388,6 @@ class KVStore {
*/
std::string type_;

/** \brief Gradient compression object starts with GC_NONE mode
* Used if SetGradientCompression sets the type.
* Currently there is no support for un-setting gradient compression
*/
std::shared_ptr<kvstore::GradientCompression> gradient_compression_;

/**
* \brief whether to do barrier when finalize
*/
12 changes: 2 additions & 10 deletions python/mxnet/gluon/trainer.py
@@ -44,20 +44,14 @@ class Trainer(object):
kvstore : str or KVStore
kvstore type for multi-gpu and distributed training. See help on
:any:`mxnet.kvstore.create` for more information.
compression_params : dict
Specifies type of gradient compression and additional arguments depending
on the type of compression being used. For example, 2bit compression requires a threshold.
Arguments would then be {'type':'2bit', 'threshold':0.5}
See mxnet.KVStore.set_gradient_compression method for more details on gradient compression.
Properties
----------
learning_rate: float
The current learning rate of the optimizer. Given an Optimizer object
optimizer, its learning rate can be accessed as optimizer.learning_rate.
"""
def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
compression_params=None):
def __init__(self, params, optimizer, optimizer_params=None, kvstore='device'):
if isinstance(params, (dict, ParameterDict)):
params = list(params.values())
if not isinstance(params, (list, tuple)):
@@ -71,7 +65,7 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device',
"First argument must be a list or dict of Parameters, " \
"got list of %s."%(type(param)))
self._params.append(param)
self._compression_params = compression_params

optimizer_params = optimizer_params if optimizer_params else {}
self._scale = optimizer_params.get('rescale_grad', 1.0)
self._contexts = self._check_contexts()
@@ -110,8 +104,6 @@ def _init_kvstore(self):
kvstore, update_on_kvstore = _create_kvstore(self._kvstore, len(self._contexts),
arg_arrays)
if kvstore:
if self._compression_params:
kvstore.set_gradient_compression(self._compression_params)
if 'dist' in kvstore.type:
update_on_kvstore = False
for i, param in enumerate(self._params):
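Before the revert, the docstring above implied usage along these lines (a minimal sketch, assuming a `net` whose parameters were built elsewhere):

    from mxnet import gluon

    # 'compression_params' is the keyword removed by this commit
    trainer = gluon.Trainer(net.collect_params(), 'sgd',
                            {'learning_rate': 0.1},
                            kvstore='dist_sync',
                            compression_params={'type': '2bit', 'threshold': 0.5})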
62 changes: 0 additions & 62 deletions python/mxnet/kvstore.py
@@ -64,16 +64,6 @@ def _ctype_key_value(keys, vals):
else c_array_buf(ctypes.c_int, array('i', [keys] * len(vals)))
return (c_keys, c_handle_array(vals), use_str_keys)

def _ctype_dict(param_dict):
"""
Returns ctype arrays for keys and values(converted to strings) in a dictionary
"""
assert(isinstance(param_dict, dict)), \
"unexpected type for param_dict: " + str(type(param_dict))
c_keys = c_array(ctypes.c_char_p, [c_str(k) for k in param_dict.keys()])
c_vals = c_array(ctypes.c_char_p, [c_str(str(v)) for v in param_dict.values()])
return (c_keys, c_vals)
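
For illustration, the removed helper turned a compression dictionary into parallel ctypes string arrays, roughly (hypothetical values):

    ckeys, cvals = _ctype_dict({'type': '2bit', 'threshold': 0.5})
    # ckeys -> (c_char_p * 2)(b'type', b'threshold')
    # cvals -> (c_char_p * 2)(b'2bit', b'0.5')   # values stringified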

def _updater_wrapper(updater):
"""A wrapper for the user-defined handle."""
def updater_handle(key, lhs_handle, rhs_handle, _):
@@ -360,58 +350,6 @@ def row_sparse_pull(self, key, out=None, priority=0, row_ids=None):
check_call(_LIB.MXKVStorePullRowSparse(
self.handle, mx_uint(len(ckeys)), ckeys, cvals, crow_ids, ctypes.c_int(priority)))

def set_gradient_compression(self, compression_params):
""" Specifies type of low-bit quantization for gradient compression \
and additional arguments depending on the type of compression being used.
2bit Gradient Compression takes a positive float `threshold`.
The technique works by thresholding values such that positive values in the
gradient above threshold will be set to threshold. Negative values whose absolute
values are higher than threshold, will be set to the negative of threshold.
Values whose absolute values are less than threshold will be set to 0.
        By doing so, each value in the gradient is in one of three states. Two bits are
used to represent these states, and every 16 float values in the original
gradient can be represented using one float. This compressed representation
can reduce communication costs. The difference between these thresholded values and
original values is stored at the sender's end as residual and added to the
gradient in the next iteration.
When kvstore is 'local', gradient compression is used to reduce communication
between multiple devices (gpus). Gradient is quantized on each GPU which
computed the gradients, then sent to the GPU which merges the gradients. This
receiving GPU dequantizes the gradients and merges them. Note that this
increases memory usage on each GPU because of the residual array stored.
When kvstore is 'dist', gradient compression is used to reduce communication
        from worker to server. Gradient is quantized on each worker which
computed the gradients, then sent to the server which dequantizes
this data and merges the gradients from each worker. Note that this
increases CPU memory usage on each worker because of the residual array stored.
Only worker to server communication is compressed in this setting.
If each machine has multiple GPUs, currently this GPU to GPU or GPU to CPU communication
is not compressed. Server to worker communication (in the case of pull)
is also not compressed.
To use 2bit compression, we need to specify `type` as `2bit`.
Only specifying `type` would use default value for the threshold.
To completely specify the arguments for 2bit compression, we would need to pass
a dictionary which includes `threshold` like:
{'type': '2bit', 'threshold': 0.5}
Parameters
----------
compression_params : dict
A dictionary specifying the type and parameters for gradient compression.
The key `type` in this dictionary is a
required string argument and specifies the type of gradient compression.
Currently `type` can be only `2bit`
Other keys in this dictionary are optional and specific to the type
of gradient compression.
"""
ckeys, cvals = _ctype_dict(compression_params)
check_call(_LIB.MXKVStoreSetGradientCompression(self.handle,
mx_uint(len(compression_params)),
ckeys, cvals))
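
The quantization rule in the removed docstring can be sketched in NumPy as follows. This is illustrative only, not MXNet's kernel: the real implementation packs sixteen 2-bit codes into each 32-bit float and runs on CPU or GPU.

    import numpy as np

    def two_bit_quantize(grad, residual, threshold=0.5):
        """Quantize to the three states {-threshold, 0, +threshold},
        keeping the quantization error as a residual for the next step."""
        g = grad + residual                 # fold in last step's error
        q = np.zeros_like(g)
        q[g >= threshold] = threshold       # large positive -> +threshold
        q[g <= -threshold] = -threshold     # large negative -> -threshold
        residual[:] = g - q                 # remember what was dropped
        return q

    grad = np.array([0.9, -0.2, -1.4, 0.1], dtype=np.float32)
    residual = np.zeros_like(grad)
    print(two_bit_quantize(grad, residual))  # [ 0.5  0.  -0.5  0. ]
    print(residual)                          # [ 0.4 -0.2 -0.9  0.1]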

def set_optimizer(self, optimizer):
""" Registers an optimizer with the kvstore.
17 changes: 3 additions & 14 deletions python/mxnet/module/bucketing_module.py
@@ -54,16 +54,10 @@ class BucketingModule(BaseModule):
Instead they are initialized to 0 and can be set by set_states()
group2ctxs : list of dict of str to context
Default is `None`. Mapping the `ctx_group` attribute to the context assignment.
compression_params : dict
Specifies type of gradient compression and additional arguments depending
on the type of compression being used. For example, 2bit compression requires a threshold.
Arguments would then be {'type':'2bit', 'threshold':0.5}
See mxnet.KVStore.set_gradient_compression method for more details on gradient compression.
"""
def __init__(self, sym_gen, default_bucket_key=None, logger=logging,
context=ctx.cpu(), work_load_list=None,
fixed_param_names=None, state_names=None, group2ctxs=None,
compression_params=None):
fixed_param_names=None, state_names=None, group2ctxs=None):
super(BucketingModule, self).__init__(logger=logger)

assert default_bucket_key is not None
@@ -81,7 +75,6 @@ def __init__(self, sym_gen, default_bucket_key=None, logger=logging,
_check_input_names(symbol, state_names, "state", True)
_check_input_names(symbol, fixed_param_names, "fixed_param", True)

self._compression_params = compression_params
self._fixed_param_names = fixed_param_names
self._state_names = state_names
self._context = context
@@ -330,9 +323,7 @@ def bind(self, data_shapes, label_shapes=None, for_training=True,
module = Module(symbol, data_names, label_names, logger=self.logger,
context=self._context, work_load_list=self._work_load_list,
fixed_param_names=self._fixed_param_names,
state_names=self._state_names,
group2ctxs=self._group2ctxs,
compression_params=self._compression_params)
state_names=self._state_names, group2ctxs=self._group2ctxs)
module.bind(data_shapes, label_shapes, for_training, inputs_need_grad,
force_rebind=False, shared_module=None, grad_req=grad_req)
self._curr_module = module
@@ -362,9 +353,7 @@ def switch_bucket(self, bucket_key, data_shapes, label_shapes=None):
logger=self.logger, context=self._context,
work_load_list=self._work_load_list,
fixed_param_names=self._fixed_param_names,
state_names=self._state_names,
group2ctxs=self._group2ctxs,
compression_params=self._compression_params)
state_names=self._state_names, group2ctxs=self._group2ctxs)
module.bind(data_shapes, label_shapes, self._curr_module.for_training,
self._curr_module.inputs_need_grad,
force_rebind=False, shared_module=self._buckets[self._default_bucket_key])
11 changes: 1 addition & 10 deletions python/mxnet/module/module.py
@@ -61,16 +61,10 @@ class Module(BaseModule):
Instead they are initialized to 0 and can be set by `set_states()`.
group2ctxs : list of dict of str to context
Default is `None`. Mapping the `ctx_group` attribute to the context assignment.
compression_params : dict
Specifies type of gradient compression and additional arguments depending
on the type of compression being used. For example, 2bit compression requires a threshold.
Arguments would then be {'type':'2bit', 'threshold':0.5}
See mxnet.KVStore.set_gradient_compression method for more details on gradient compression.
"""
def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
logger=logging, context=ctx.cpu(), work_load_list=None,
fixed_param_names=None, state_names=None, group2ctxs=None,
compression_params=None):
fixed_param_names=None, state_names=None, group2ctxs=None):
super(Module, self).__init__(logger=logger)

if isinstance(context, ctx.Context):
@@ -109,7 +103,6 @@ def __init__(self, symbol, data_names=('data',), label_names=('softmax_label',),
self._aux_params = None
self._params_dirty = False

self._compression_params = compression_params
self._optimizer = None
self._kvstore = None
self._update_on_kvstore = None
@@ -532,8 +525,6 @@ def init_optimizer(self, kvstore='local', optimizer='sgd',
self._updater = None

if kvstore:
if self._compression_params:
kvstore.set_gradient_compression(self._compression_params)
# copy initialized local parameters to kvstore
_initialize_kvstore(kvstore=kvstore,
param_arrays=self._exec_group.param_arrays,
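As the removed init_optimizer hook above shows, compression was applied when the optimizer was bound to a kvstore. A minimal sketch, assuming a symbol `sym` and a data iterator `train_iter` defined elsewhere:

    import mxnet as mx

    mod = mx.mod.Module(symbol=sym, context=mx.gpu(0),
                        compression_params={'type': '2bit', 'threshold': 0.5})
    mod.bind(data_shapes=train_iter.provide_data,
             label_shapes=train_iter.provide_label)
    mod.init_params()
    mod.init_optimizer(kvstore='dist_sync')  # set_gradient_compression ran here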
14 changes: 0 additions & 14 deletions src/c_api/c_api.cc
@@ -748,20 +748,6 @@ int MXKVStoreCreate(const char *type,
API_END();
}

int MXKVStoreSetGradientCompression(KVStoreHandle handle, mx_uint num_params,
const char** keys, const char** vals) {
API_BEGIN();
std::vector<std::pair<std::string, std::string> > params;
for (mx_uint i = 0; i < num_params; ++i) {
std::pair<std::string, std::string> p;
p.first = keys[i];
p.second = vals[i];
params.push_back(p);
}
static_cast<KVStore*>(handle)->SetGradientCompression(params);
API_END();
}

int MXKVStoreFree(KVStoreHandle handle) {
API_BEGIN();
delete static_cast<KVStore*>(handle);
(Diff for the remaining 12 changed files not shown.)
