Merge pull request #540 from bo3z/quartus-softmax
Vivado-equivalent implementation of Softmax on Quartus
thesps committed May 10, 2022
2 parents c96a7bc + a7753fb commit 6f6e3b2
Showing 8 changed files with 595 additions and 123 deletions.
1 change: 1 addition & 0 deletions hls4ml/backends/quartus/passes/core_templates.py
@@ -122,6 +122,7 @@ def format(self, node):
static const unsigned table_size = {table_size};
static const unsigned io_type = nnet::{iotype};
static const unsigned reuse_factor = {reuse};
static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
typedef {exp_table_t.name} exp_table_t;
typedef {inv_table_t.name} inv_table_t;
}};\n"""
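As an illustration, the template above renders to a C++ config struct of roughly this shape (a sketch; the struct name, sizes, and precisions are hypothetical and are generated per model):

struct softmax_config10 {
    static const unsigned n_in = 10;
    static const unsigned table_size = 1024;
    static const unsigned io_type = nnet::io_parallel;
    static const unsigned reuse_factor = 1;
    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::stable;
    typedef ac_fixed<18,8,true,AC_RND,AC_SAT> exp_table_t;
    typedef ac_fixed<18,8,true,AC_RND,AC_SAT> inv_table_t;
};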
5 changes: 5 additions & 0 deletions hls4ml/backends/quartus/quartus_backend.py
@@ -188,3 +188,8 @@ def init_softmax(self, layer):
layer.set_attr('exp_table_t', layer.get_attr('table_t'))
if 'inv_table_t' not in layer.attributes:
layer.set_attr('inv_table_t', layer.get_attr('table_t'))
if layer.model.config.is_resource_strategy(layer):
# The 'resource' strategy falls back to the 'latency' implementation for Softmax
layer.set_attr('implementation', 'latency')
else:
layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower())
175 changes: 136 additions & 39 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h
@@ -126,47 +126,144 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
// *************************************************
// Softmax Activation
// *************************************************

enum class softmax_implementation {latency=0, legacy=1, stable=2};
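// The implementation is chosen at compile time via CONFIG_T::implementation,
// which the Quartus backend (see init_softmax above) derives from the layer's strategy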

template<class data_T, typename CONFIG_T>
inline unsigned softmax_idx_from_real_val(const data_T x){
// Number of address bits for table
static constexpr int N = ceillog2(CONFIG_T::table_size);

// Slice the top N bits of the input
hls_register ac_int<N, false> y = x.template slc<N>(x.width-N);
return y.to_uint();
}
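// Worked example (illustrative): with table_size = 1024, N = ceillog2(1024) = 10;
// for a 16-bit input word the address is its top 10 bits, i.e. raw_bits >> 6,
// so raw 0x0000 maps to entry 0 and raw 0xFFFF to entry 1023, covering the
// table uniformly across the input range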

template <class data_T, class res_T, typename CONFIG_T>
void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
// Look-up tables
#include "activation_tables/exp_table.tb"
#include "activation_tables/invert_table.tb"

// Find maximum
Op_max<data_T> op_max;
hls_register data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);

// Calculate differences from the maximum, forcing rounding and saturation for better accuracy
hls_register ac_fixed<data_T::width, data_T::i_width, true, AC_RND, AC_SAT> d_xi_xmax[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
d_xi_xmax[i] = data[i] - x_max;
}

// Calculate all the e^x's
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table[softmax_idx_from_real_val<data_T, CONFIG_T>(d_xi_xmax[i])];
}

// Explicitly sum previously calculated exponentials with an adder tree
Op_add<typename CONFIG_T::exp_table_t> op_add;
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);

// Multiply the previously calculated exponentials by the reciprocal of the sum
hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t,CONFIG_T>(exp_sum)];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
res[i] = exp_res[i] * inv_exp_sum;
}
}
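// For clarity, a double-precision reference model of the steps above (find the
// maximum, exponentiate the differences, normalize by the sum). An illustrative
// sketch only, not synthesizable HLS code:
#include <cmath>

template<unsigned N>
void softmax_stable_reference(const double (&x)[N], double (&y)[N]) {
    double x_max = x[0];
    for (unsigned i = 1; i < N; i++)
        x_max = (x[i] > x_max) ? x[i] : x_max;
    double exp_sum = 0.0;
    for (unsigned i = 0; i < N; i++) {
        y[i] = std::exp(x[i] - x_max);    // every exponent is <= 0, so exp() cannot overflow
        exp_sum += y[i];
    }
    const double inv_exp_sum = 1.0 / exp_sum;
    for (unsigned i = 0; i < N; i++)
        y[i] *= inv_exp_sum;              // same final multiply as the hardware version
}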

// TODO - Improve accuracy
template <class data_T, class res_T, typename CONFIG_T>
void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
/*
* Note: The latency tables are equivalent to the stable tables.
* However, the compiler cannot include the same table twice;
* doing so throws an out-of-scope exception in one of the functions.
* Temporary solution - create the same table twice in quartus_writer.py.
* Long-term solution - only create the tables needed by the network;
* currently, quartus_writer.py generates LUTs for all activations,
* regardless of whether they are present in the network or not.
*/
#include "activation_tables/exp_table_latency.tb"
#include "activation_tables/invert_table_latency.tb"

// Calculate all the e^x's
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table_latency[softmax_idx_from_real_val<data_T, CONFIG_T>(data[i])];
}

// Explicitly sum the results with an adder tree.
Op_add<typename CONFIG_T::exp_table_t> op_add;
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);

// Multiply the previously calculated exponentials by the reciprocal of the sum
hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t,CONFIG_T>(exp_sum)];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++){
res[i] = exp_res[i] * inv_exp_sum;
}
}
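// Unlike softmax_stable, this version indexes the exp table with the raw inputs,
// skipping the max-reduction and subtraction; this shortens the critical path but
// loses accuracy for inputs far outside the table's range (hence the TODO above)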

template<class data_T, class res_T, typename CONFIG_T>
void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
#include "activation_tables/exp_table_legacy.tb"
#include "activation_tables/invert_table_legacy.tb"

hls_register int data_round[CONFIG_T::n_in];
New_loop:
#pragma unroll
for (int ii=0; ii<CONFIG_T::n_in; ii++) {
data_round[ii] = (data[ii] * CONFIG_T::table_size/16).to_int();
}
NN_Outer:
#pragma unroll
for (int ii=0; ii<CONFIG_T::n_in; ii++) {
typename CONFIG_T::exp_table_t exp_res_temp = 0;
NN_Inner:
#pragma unroll
for (int jj=0; jj<CONFIG_T::n_in; jj++) {
if (ii==jj) {
exp_res_temp += 1;
}
else {
int _data_cache = (data_round[jj]-data_round[ii]);
int index = _data_cache + 8*CONFIG_T::table_size/16;

if (index < 0) index = 0;
if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;

typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index];
exp_res_temp += temp_exp;
}
}
int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int();
if (exp_res_index < 0) exp_res_index = 0;
if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1;
res[ii] = invert_table_legacy[exp_res_index];
}
}

template<class data_T, class res_T, typename CONFIG_T>
inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
switch(CONFIG_T::implementation) {
case softmax_implementation::stable:
softmax_stable<data_T, res_T, CONFIG_T>(data, res);
break;
case softmax_implementation::latency:
softmax_latency<data_T, res_T, CONFIG_T>(data, res);
break;
case softmax_implementation::legacy:
softmax_legacy<data_T, res_T, CONFIG_T>(data, res);
break;
default:
softmax_stable<data_T, res_T, CONFIG_T>(data, res);
break;
}
}
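// Illustrative call site, as it would appear in generated layer code (the type
// and config names are hypothetical):
//   nnet::softmax<layer2_t, result_t, softmax_config10>(layer2_out, layer3_out);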

// *************************************************
42 changes: 42 additions & 0 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h
@@ -24,10 +24,12 @@
#include "ac_int.h"
#include "ac_fixed.h"
#include "math.h"
#include "nnet_helpers.h"
#else
#include "HLS/ac_int.h"
#include "HLS/ac_fixed.h"
#include "HLS/math.h"
#include "nnet_helpers.h"
#endif

typedef ac_fixed<16,6> table_default_t;
@@ -58,6 +60,46 @@ typedef ac_fixed<32,10> accum_t_def;
}
}

/* ---
* Balanced tree reduce implementation.
* For use in scenarios where Quartus cannot balance expressions itself.
* Reduces an array of inputs to a single value using the templated binary operator 'Op',
* for example summing all elements with Op_add, or finding the maximum with Op_max.
* Use only when the input array is fully unrolled; otherwise, slice out a fully
* unrolled section before applying, and accumulate the result over the rolled dimension.
* --- */
template<class T, int N, class Op>
T reduce(const T* x, Op op)
{
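// Split into leftN, the largest power of two strictly below N, and the
// remainder rightN = N - leftN, so the recursion forms a balanced binary tree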
static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
if (N == 1){
return x[0];
}
if (N == 2){
return op(x[0],x[1]);
}
return op(reduce<T,leftN,Op>(x, op), reduce<T,rightN,Op>(x+leftN, op));
}



template<class T>
class Op_add{
public:
T operator()(T a, T b){
return a + b;
}
};

template<class T>
class Op_max{
public:
T operator()(T a, T b){
return a >= b ? a : b;
}
};

}

#endif
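A minimal usage sketch of reduce with the operators above, assuming the ac_* headers are on the include path (values are illustrative):

#include <cassert>
#include "nnet_common.h"

int main() {
    int x[8] = {3, 1, 4, 1, 5, 9, 2, 6};
    nnet::Op_add<int> op_add;
    nnet::Op_max<int> op_max;
    // Balanced tree: ((x0+x1)+(x2+x3)) + ((x4+x5)+(x6+x7))
    assert((nnet::reduce<int, 8, nnet::Op_add<int> >(x, op_add)) == 31);
    assert((nnet::reduce<int, 8, nnet::Op_max<int> >(x, op_max)) == 9);
    return 0;
}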
6 changes: 5 additions & 1 deletion hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h
@@ -26,7 +26,6 @@
#include <fstream>
#include <algorithm>
#include <map>
#include "nnet_common.h"

namespace nnet {

@@ -50,6 +49,11 @@ extern size_t trace_type_size;
constexpr int ceillog2(int x){
return (x <= 2) ? 1 : 1 + ceillog2((x+1) / 2);
}

constexpr int floorlog2(int x){
return (x < 2) ? 0 : 1 + floorlog2(x / 2);
}

constexpr int pow2(int x){
return x == 0 ? 1 : 2 * pow2(x - 1);
}
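// Worked examples: floorlog2(5) = 2, ceillog2(5) = 3, pow2(3) = 8;
// reduce<T, N, Op> uses these to split N = 5 into leftN = 4 and rightN = 1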