This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Commit

1bit gradient compression implementation
shuo-ouyang committed Aug 9, 2020
1 parent d4052fd commit db71b9c
Showing 8 changed files with 428 additions and 70 deletions.
6 changes: 4 additions & 2 deletions ci/docker/runtime_functions.sh
@@ -1315,8 +1315,10 @@ integrationtest_ubuntu_cpu_dist_kvstore() {
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=gluon_type_cpu
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_1bit --no-multiprecision
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit
python3 ../../tools/launch.py -n 7 --launcher local python3 dist_sync_kvstore.py --type=compressed_cpu_2bit --no-multiprecision
python3 ../../tools/launch.py -n 3 --launcher local python3 test_server_profiling.py
popd
}
5 changes: 4 additions & 1 deletion python/mxnet/kvstore/kvstore.py
@@ -498,6 +498,9 @@ def set_gradient_compression(self, compression_params):
""" Specifies type of low-bit quantization for gradient compression \
and additional arguments depending on the type of compression being used.
The 1bit compression works as follows: values in the gradient that are above the
threshold will be set to +1, whereas values at or below the threshold will be set to -1.
2bit Gradient Compression takes a positive float `threshold`.
The technique works by thresholding values such that positive values in the
gradient above threshold will be set to threshold. Negative values whose absolute
@@ -538,7 +541,7 @@ def set_gradient_compression(self, compression_params):
A dictionary specifying the type and parameters for gradient compression.
The key `type` in this dictionary is a
required string argument and specifies the type of gradient compression.
Currently `type` can be only `2bit`
Currently `type` can be only `1bit` and `2bit`
Other keys in this dictionary are optional and specific to the type
of gradient compression.
"""
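A minimal usage sketch (not part of this commit) of the API documented above; the kvstore
type and the threshold value are only illustrative:

import mxnet as mx

kv = mx.kv.create('dist_sync')
# '1bit' quantizes each gradient value to a single bit: values whose
# accumulated residual exceeds `threshold` dequantize to +1, all others to -1.
kv.set_gradient_compression({'type': '1bit', 'threshold': 0.5})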
107 changes: 107 additions & 0 deletions src/kvstore/gradient_compression-inl.h
@@ -32,11 +32,106 @@ namespace mxnet {
namespace kvstore {

// these gpu functions are defined in gradient_compression.cu
void Quantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
void Dequantize1BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
void Quantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);
void Dequantize2BitImpl(mshadow::Stream<mshadow::gpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold);

struct quantize_1bit {
MSHADOW_XINLINE static void Map(int out_block_id,
int original_size,
float *out,
float *grad,
float *residual,
const float threshold) {
// this block contains the compressed representation of
// up to 32 values starting from out_block_id*32
float *compr_block = out + out_block_id;
// init to 0
*compr_block = 0;
// start and end are indices in original grad array
const int start = out_block_id << 5;
const int end = (start + 32 <= original_size) ? start + 32 : original_size;

char *block_ptr = reinterpret_cast<char *>(compr_block);
// masks used to quantize data
const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
for (int i = start; i < end; ++i) {
// adds offset to reach appropriate byte
char *curr_byte = block_ptr + ((i - start) >> 3);
// adds gradient to existing residual to get updated grad
residual[i] += grad[i];
if (residual[i] > threshold) {
// set data to 1
*curr_byte |= bits[(i & 7)];
// subtract 1 from the residual because this position will be dequantized to +1
residual[i] -= 1;
} else {
// set data to 0
*curr_byte &= ~bits[(i & 7)];
// add 1 to the residual
// because this position will be dequantized to -1
residual[i] += 1;
}
}
}
};

template<typename xpu>
void Quantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
mxnet::op::mxnet_op::Kernel<quantize_1bit, xpu>
::Launch(s,
inputs[2].Size(), // compressed array size
inputs[0].Size(), // original size
inputs[2].dptr<float>(), // compressed array
inputs[0].dptr<float>(), // original array
inputs[1].dptr<float>(), // residual array
threshold); // threshold
}
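The same quantization step as an illustrative NumPy sketch (not part of this commit; the
function name is hypothetical). Each 32-bit float of the compressed output packs 32 gradient
values, which is also why GetCompressionFactor() below reports a factor of 32 for 1-bit
compression; the quantization error is carried to the next call through the residual:

import numpy as np

def quantize_1bit_reference(grad, residual, threshold):
    # Updates `residual` in place with the quantization error and
    # returns the gradient packed into one bit per value.
    bits = np.zeros(grad.size, dtype=np.uint8)
    for i in range(grad.size):
        residual[i] += grad[i]
        if residual[i] > threshold:
            bits[i] = 1            # dequantizes to +1
            residual[i] -= 1
        else:
            bits[i] = 0            # dequantizes to -1
            residual[i] += 1
    return np.packbits(bits)       # 32 values per 32 bits of output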

struct dequantize_1bit {
MSHADOW_XINLINE static void Map(int i,
float *out,
float *in,
const float threshold) {
// get position of dequantized value to fill
float *outval = out + i;
// gets byte which holds quantized value for this position
char *ch_ptr = reinterpret_cast<char *>(in + (i >> 5));
ch_ptr += ((i & 31) >> 3);
// masks used to quantize data
const uint8_t bits[] = {0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01};
// col denotes which bit of a byte is set for this value
// col=0 implies the first bit, col=1 implies the second bit,...
const int col = i & 7;
const uint8_t mask = bits[col];
const uint8_t masked = *ch_ptr & mask;
if (masked == mask) {
*outval = +1;
} else {
// if the bit at this position is 0,
// dequantize it to -1
*outval = -1;
}
}
};

template<typename xpu>
void Dequantize1BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
mxnet::op::mxnet_op::Kernel<dequantize_1bit, xpu>
::Launch(s,
inputs[1].Size(), // original size
inputs[1].dptr<float>(), // out array
inputs[0].dptr<float>(), // compressed array
threshold); // threshold
}
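A matching dequantization sketch (continuing the NumPy example above): a set bit becomes +1
and a cleared bit becomes -1, so the threshold is not used on this side even though the kernel
above accepts it:

def dequantize_1bit_reference(packed, original_size):
    # Expand each stored bit back to a float: 1 -> +1.0, 0 -> -1.0.
    bits = np.unpackbits(packed)[:original_size]
    return np.where(bits == 1, 1.0, -1.0).astype(np.float32)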

struct quantize_2bit {
MSHADOW_XINLINE static void Map(int out_block_id,
int original_size,
@@ -138,6 +233,18 @@ void Dequantize2BitKernelLaunch(mshadow::Stream<xpu> *s, const std::vector<mxnet
threshold); // positive threshold
}

inline void Quantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
Quantize1BitKernelLaunch(s, inputs, threshold);
}

inline void Dequantize1BitImpl(mshadow::Stream<mshadow::cpu> *s,
const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
Dequantize1BitKernelLaunch(s, inputs, threshold);
}

inline void Quantize2BitImpl(mshadow::Stream<mshadow::cpu> *s,
const std::vector<mxnet::TBlob> &inputs,
const float threshold) {
84 changes: 64 additions & 20 deletions src/kvstore/gradient_compression.cc
@@ -41,8 +41,10 @@ void GradientCompression::SetParams(const std::vector<std::pair<std::string, std
& kwargs) {
GradientCompressionParam params;
params.InitAllowUnknown(kwargs);
CHECK_GT(params.threshold, 0) << "threshold must be greater than 0";
if (params.type == "2bit") {
if (params.type == "1bit") {
SetOneBitCompression(params.threshold);
} else if (params.type == "2bit") {
CHECK_GT(params.threshold, 0) << "threshold must be greater than 0 for two bit compression";
SetTwoBitCompression(params.threshold);
} else {
LOG(FATAL) << "Unknown type for gradient compression " << params.type;
@@ -57,6 +59,11 @@ std::string GradientCompression::get_type_str() {
return std::to_string(static_cast<int>(type_));
}

void GradientCompression::SetOneBitCompression(const float threshold) {
type_ = CompressionType::kOneBit;
threshold_ = threshold;
}

void GradientCompression::SetTwoBitCompression(const float threshold) {
type_ = CompressionType::kTwoBit;
threshold_ = threshold;
@@ -83,7 +90,9 @@ void GradientCompression::DecodeParams(const std::string &s) {
}

int GradientCompression::GetCompressionFactor() {
if (type_ == CompressionType::kTwoBit) {
if (type_ == CompressionType::kOneBit) {
return 32;
} else if (type_ == CompressionType::kTwoBit) {
return 16;
} else {
LOG(FATAL) << "Unsupported compression type: " << get_type_str();
@@ -106,16 +115,34 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
const int a = from.ctx().dev_mask();
const int b = to->ctx().dev_mask();
const float threshold = threshold_;
if (type_ == CompressionType::kTwoBit) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeCPU");
} else {
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
} else {
#if MXNET_USE_CUDA
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait for the GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, residual, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), residual->data(), to->data()};
Quantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
@@ -124,14 +151,14 @@ void GradientCompression::Quantize(const mxnet::NDArray &from, mxnet::NDArray *t
}, from.ctx(), {from.var()}, {to->var(), residual->var()},
mxnet::FnProperty::kNormal, priority, "QuantizeGPU");
} else {
LOG(FATAL) << "unknown device mask";
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
} else {
LOG(FATAL) << "unknown device mask";
}
#else
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
#endif
}
} else {
LOG(FATAL) << "Unsupported quantization of type " << get_type_str();
}
}

@@ -142,35 +169,52 @@ void GradientCompression::Dequantize(const mxnet::NDArray &from, mxnet::NDArray
const int a = from.ctx().dev_mask();
const int b = to->ctx().dev_mask();
const float threshold = threshold_;
if (type_ == CompressionType::kTwoBit) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (a == mshadow::cpu::kDevMask && b == mshadow::cpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize1BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::cpu>(), inputs, threshold);
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeCPU");
} else {
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
} else {
#if MXNET_USE_CUDA
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (a == mshadow::gpu::kDevMask && b == mshadow::gpu::kDevMask) {
if (type_ == CompressionType::kOneBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
Dequantize1BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait for the GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
} else if (type_ == CompressionType::kTwoBit) {
mxnet::Engine::Get()->PushSync([from, to, threshold](mxnet::RunContext ctx) {
std::vector<mxnet::TBlob> inputs = {from.data(), to->data()};
Dequantize2BitImpl(ctx.get_stream<mshadow::gpu>(), inputs, threshold);
// Wait for the GPU kernel to complete
ctx.get_stream<mshadow::gpu>()->Wait();
}, from.ctx(), {from.var()}, {to->var()},
mxnet::FnProperty::kNormal, priority, "DequantizeGPU");
} else {
LOG(FATAL) << "unknown device mask";
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
} else {
LOG(FATAL) << "unknown device mask";
}
#else
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
#endif
}
} else {
LOG(FATAL) << "Unsupported dequantization of type " << get_type_str();
}
}

} // namespace kvstore
} // namespace mxnet

10 changes: 10 additions & 0 deletions src/kvstore/gradient_compression.cu
@@ -27,6 +27,16 @@

namespace mxnet {
namespace kvstore {
void Quantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
const float threshold) {
Quantize1BitKernelLaunch(s, inputs, threshold);
}

void Dequantize1BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
const float threshold) {
Dequantize1BitKernelLaunch(s, inputs, threshold);
}

void Quantize2BitImpl(mshadow::Stream<gpu>* s, const std::vector<TBlob>& inputs,
const float threshold) {
Quantize2BitKernelLaunch(s, inputs, threshold);
8 changes: 7 additions & 1 deletion src/kvstore/gradient_compression.h
@@ -35,7 +35,7 @@ namespace mxnet {
namespace kvstore {

enum class CompressionType {
kNone, kTwoBit
kNone, kOneBit, kTwoBit
};

struct GradientCompressionParam : public dmlc::Parameter<GradientCompressionParam> {
@@ -72,6 +72,12 @@ class GradientCompression {
*/
std::string get_type_str();

/*!
* \brief sets one bit gradient compression
* \param threshold float value used for thresholding gradients
*/
void SetOneBitCompression(const float threshold);

/*!
* \brief sets two bit gradient compression
* \param threshold float value used for thresholding gradients

