Merge pull request #540 from bo3z/quartus-softmax
Vivado-equivalent implementation of Softmax on Quartus
thesps committed May 10, 2022
2 parents c96a7bc + a7753fb commit 6f6e3b2
Showing 8 changed files with 595 additions and 123 deletions.
1 change: 1 addition & 0 deletions hls4ml/backends/quartus/passes/core_templates.py
@@ -122,6 +122,7 @@ def format(self, node):
static const unsigned table_size = {table_size};
static const unsigned io_type = nnet::{iotype};
static const unsigned reuse_factor = {reuse};
static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
typedef {exp_table_t.name} exp_table_t;
typedef {inv_table_t.name} inv_table_t;
}};\n"""
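As an illustration, the template above renders to a C++ config struct of roughly this shape (a sketch; the struct name, sizes, and precisions are hypothetical and are generated per model):

struct softmax_config10 {
    static const unsigned n_in = 10;
    static const unsigned table_size = 1024;
    static const unsigned io_type = nnet::io_parallel;
    static const unsigned reuse_factor = 1;
    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::stable;
    typedef ac_fixed<18,8,true,AC_RND,AC_SAT> exp_table_t;
    typedef ac_fixed<18,8,true,AC_RND,AC_SAT> inv_table_t;
};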
5 changes: 5 additions & 0 deletions hls4ml/backends/quartus/quartus_backend.py
@@ -188,3 +188,8 @@ def init_softmax(self, layer):
layer.set_attr('exp_table_t', layer.get_attr('table_t'))
if 'inv_table_t' not in layer.attributes:
layer.set_attr('inv_table_t', layer.get_attr('table_t'))
if layer.model.config.is_resource_strategy(layer):
# The 'resource' strategy falls back to the 'latency' implementation for Softmax
layer.set_attr('implementation', 'latency')
else:
layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower())
175 changes: 136 additions & 39 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h
@@ -126,47 +126,144 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
// *************************************************
// Softmax Activation
// *************************************************

enum class softmax_implementation {latency=0, legacy=1, stable=2};
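// The implementation is chosen at compile time via CONFIG_T::implementation,
// which the Quartus backend (see init_softmax above) derives from the layer's strategy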

template<class data_T, typename CONFIG_T>
inline unsigned softmax_idx_from_real_val(const data_T x){
// Number of address bits for table
static constexpr int N = ceillog2(CONFIG_T::table_size);

// Slice the top N bits of the input
hls_register ac_int<N, false> y = x.template slc<N>(x.width-N);
return y.to_uint();
}
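// Worked example (illustrative): with table_size = 1024, N = ceillog2(1024) = 10;
// for a 16-bit input word the address is its top 10 bits, i.e. raw_bits >> 6,
// so raw 0x0000 maps to entry 0 and raw 0xFFFF to entry 1023, covering the
// table uniformly across the input range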

template <class data_T, class res_T, typename CONFIG_T>
void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
// Look-up tables
#include "activation_tables/exp_table.tb"
#include "activation_tables/invert_table.tb"

// Find maximum
Op_max<data_T> op_max;
hls_register data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);

// Calculate differences from the maximum, forcing rounding and saturation for better accuracy
hls_register ac_fixed<data_T::width, data_T::i_width, true, AC_RND, AC_SAT> d_xi_xmax[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
d_xi_xmax[i] = data[i] - x_max;
}

// Calculate all the e^x's
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table[softmax_idx_from_real_val<data_T, CONFIG_T>(d_xi_xmax[i])];
}

// Explicitly sum previously calculated exponentials with an adder tree
Op_add<typename CONFIG_T::exp_table_t> op_add;
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);

// Multiply the previously calculated exponentials by the reciprocal of the sum
hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t,CONFIG_T>(exp_sum)];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
res[i] = exp_res[i] * inv_exp_sum;
}
}
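// For clarity, a double-precision reference model of the steps above (find the
// maximum, exponentiate the differences, normalize by the sum). An illustrative
// sketch only, not synthesizable HLS code:
#include <cmath>

template<unsigned N>
void softmax_stable_reference(const double (&x)[N], double (&y)[N]) {
    double x_max = x[0];
    for (unsigned i = 1; i < N; i++)
        x_max = (x[i] > x_max) ? x[i] : x_max;
    double exp_sum = 0.0;
    for (unsigned i = 0; i < N; i++) {
        y[i] = std::exp(x[i] - x_max);    // every exponent is <= 0, so exp() cannot overflow
        exp_sum += y[i];
    }
    const double inv_exp_sum = 1.0 / exp_sum;
    for (unsigned i = 0; i < N; i++)
        y[i] *= inv_exp_sum;              // same final multiply as the hardware version
}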

// TODO - Improve accuracy
template <class data_T, class res_T, typename CONFIG_T>
void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
/*
* Note: The latency tables are equivalent to the stable tables.
* However, the compiler cannot include the same table twice;
* doing so throws an out-of-scope exception in one of the functions.
* Temporary solution - create the same table twice in quartus_writer.py.
* Long-term solution - only create the tables needed by the network;
* currently, quartus_writer.py generates LUTs for all activations,
* regardless of whether they are present in the network or not.
*/
#include "activation_tables/exp_table_latency.tb"
#include "activation_tables/invert_table_latency.tb"

// Calculate all the e^x's
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table_latency[softmax_idx_from_real_val<data_T, CONFIG_T>(data[i])];
}

// Explicitly sum the results with an adder tree.
Op_add<typename CONFIG_T::exp_table_t> op_add;
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);

// Multiply the previously calculated exponentials by the reciprocal of the sum
hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t,CONFIG_T>(exp_sum)];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++){
res[i] = exp_res[i] * inv_exp_sum;
}
}
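// Unlike softmax_stable, this version indexes the exp table with the raw inputs,
// skipping the max-reduction and subtraction; this shortens the critical path but
// loses accuracy for inputs far outside the table's range (hence the TODO above)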

template<class data_T, class res_T, typename CONFIG_T>
void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
#include "activation_tables/exp_table_legacy.tb"
#include "activation_tables/invert_table_legacy.tb"

hls_register int data_round[CONFIG_T::n_in];
New_loop:
#pragma unroll
for (int ii=0; ii<CONFIG_T::n_in; ii++) {
data_round[ii] = (data[ii] * CONFIG_T::table_size/16).to_int();
}
NN_Outer:
#pragma unroll
for (int ii=0; ii<CONFIG_T::n_in; ii++) {
typename CONFIG_T::exp_table_t exp_res_temp = 0;
NN_Inner:
#pragma unroll
for (int jj=0; jj<CONFIG_T::n_in; jj++) {
if (ii==jj) {
exp_res_temp += 1;
}
else {
int _data_cache = (data_round[jj]-data_round[ii]);
int index = _data_cache + 8*CONFIG_T::table_size/16;

if (index < 0) index = 0;
if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;

typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index];
exp_res_temp += temp_exp;
}
}
int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int();
if (exp_res_index < 0) exp_res_index = 0;
if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1;
res[ii] = invert_table_legacy[exp_res_index];
}
}

template<class data_T, class res_T, typename CONFIG_T>
inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
switch(CONFIG_T::implementation) {
case softmax_implementation::stable:
softmax_stable<data_T, res_T, CONFIG_T>(data, res);
break;
case softmax_implementation::latency:
softmax_latency<data_T, res_T, CONFIG_T>(data, res);
break;
case softmax_implementation::legacy:
softmax_legacy<data_T, res_T, CONFIG_T>(data, res);
break;
default:
softmax_stable<data_T, res_T, CONFIG_T>(data, res);
break;
}
}
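// Illustrative call site, as it would appear in generated layer code (the type
// and config names are hypothetical):
//   nnet::softmax<layer2_t, result_t, softmax_config10>(layer2_out, layer3_out);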

// *************************************************
42 changes: 42 additions & 0 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h
@@ -24,10 +24,12 @@
#include "ac_int.h"
#include "ac_fixed.h"
#include "math.h"
#include "nnet_helpers.h"
#else
#include "HLS/ac_int.h"
#include "HLS/ac_fixed.h"
#include "HLS/math.h"
#include "nnet_helpers.h"
#endif

typedef ac_fixed<16,6> table_default_t;
@@ -58,6 +60,46 @@ typedef ac_fixed<32,10> accum_t_def;
}
}

/* ---
* Balanced tree reduce implementation.
* For use in scenarios where Quartus cannot balance expressions itself.
* Reduces an array of inputs to a single value using the templated binary operator 'Op',
* for example summing all elements with Op_add, or finding the maximum with Op_max.
* Use only when the input array is fully unrolled; otherwise, slice out a fully
* unrolled section before applying, and accumulate the result over the rolled dimension.
* --- */
template<class T, int N, class Op>
T reduce(const T* x, Op op)
{
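// Split into leftN, the largest power of two strictly below N, and the
// remainder rightN = N - leftN, so the recursion forms a balanced binary tree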
static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
if (N == 1){
return x[0];
}
if (N == 2){
return op(x[0],x[1]);
}
return op(reduce<T,leftN,Op>(x, op), reduce<T,rightN,Op>(x+leftN, op));
}



template<class T>
class Op_add{
public:
T operator()(T a, T b){
return a + b;
}
};

template<class T>
class Op_max{
public:
T operator()(T a, T b){
return a >= b ? a : b;
}
};

}

#endif
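A minimal usage sketch of reduce with the operators above, assuming the ac_* headers are on the include path (values are illustrative):

#include <cassert>
#include "nnet_common.h"

int main() {
    int x[8] = {3, 1, 4, 1, 5, 9, 2, 6};
    nnet::Op_add<int> op_add;
    nnet::Op_max<int> op_max;
    // Balanced tree: ((x0+x1)+(x2+x3)) + ((x4+x5)+(x6+x7))
    assert((nnet::reduce<int, 8, nnet::Op_add<int> >(x, op_add)) == 31);
    assert((nnet::reduce<int, 8, nnet::Op_max<int> >(x, op_max)) == 9);
    return 0;
}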
6 changes: 5 additions & 1 deletion hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h
@@ -26,7 +26,6 @@
#include <fstream>
#include <algorithm>
#include <map>
#include "nnet_common.h"

namespace nnet {

@@ -50,6 +49,11 @@ extern size_t trace_type_size;
constexpr int ceillog2(int x){
return (x <= 2) ? 1 : 1 + ceillog2((x+1) / 2);
}

constexpr int floorlog2(int x){
return (x < 2) ? 0 : 1 + floorlog2(x / 2);
}

constexpr int pow2(int x){
return x == 0 ? 1 : 2 * pow2(x - 1);
}
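// Worked examples: floorlog2(5) = 2, ceillog2(5) = 3, pow2(3) = 8;
// reduce<T, N, Op> uses these to split N = 5 into leftN = 4 and rightN = 1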