Vivado-equivalent implementation of Softmax on Quartus #540

Merged
merged 5 commits on May 10, 2022
1 change: 1 addition & 0 deletions hls4ml/backends/quartus/passes/core_templates.py
@@ -122,6 +122,7 @@ def format(self, node):
static const unsigned table_size = {table_size};
static const unsigned io_type = nnet::{iotype};
static const unsigned reuse_factor = {reuse};
static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
typedef {exp_table_t.name} exp_table_t;
typedef {inv_table_t.name} inv_table_t;
}};\n"""
5 changes: 5 additions & 0 deletions hls4ml/backends/quartus/quartus_backend.py
@@ -188,3 +188,8 @@ def init_softmax(self, layer):
layer.set_attr('exp_table_t', layer.get_attr('table_t'))
if 'inv_table_t' not in layer.attributes:
layer.set_attr('inv_table_t', layer.get_attr('table_t'))
if layer.model.config.is_resource_strategy(layer):
# 'resource' strategy = 'latency' for Softmax
layer.set_attr('implementation', 'latency')
else:
layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower())
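With this mapping, a layer configured with the 'Resource' strategy (or 'Latency') uses the latency softmax implementation, while any other per-layer strategy name, for example 'Stable' or 'Legacy' if exposed in the user configuration, is lower-cased and passed through as the implementation selected in the firmware.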
175 changes: 136 additions & 39 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h
@@ -126,47 +126,144 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
// *************************************************
// Softmax Activation
// *************************************************

enum class softmax_implementation {latency=0, legacy=1, stable=2};
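// latency - exponentiates the inputs directly through look-up tables (fewest operations, reduced accuracy)
// legacy  - the previous Quartus implementation, based on pairwise input differences
// stable  - subtracts the maximum before exponentiating, for better numerical behaviour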

template<class data_T, typename CONFIG_T>
inline unsigned softmax_idx_from_real_val(const data_T x){
// Number of address bits for table
static constexpr int N = ceillog2(CONFIG_T::table_size);

// Slice the top N bits of the input
hls_register ac_int<N, false> y = x.template slc<N>(x.width-N);
return y.to_uint();
}
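// Example (assumed widths): with table_size = 1024 there are N = 10 address bits,
// so for an 18-bit ac_fixed input the slice keeps bits 17..8, i.e. the ten most
// significant bits of the value select the table entry.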

template <class data_T, class res_T, typename CONFIG_T>
void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
// Look-up tables
#include "activation_tables/exp_table.tb"
#include "activation_tables/invert_table.tb"

// Find maximum
Op_max<data_T> op_max;
hls_register data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);

// Calculate differences from the maximum, forcing rounding and saturation for better accuracy
hls_register ac_fixed<data_T::width, data_T::i_width, true, AC_RND, AC_SAT> d_xi_xmax[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
d_xi_xmax[i] = data[i] - x_max;
}

// Calculate all the e^x's
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table[softmax_idx_from_real_val<data_T, CONFIG_T>(d_xi_xmax[i])];
}

// Explicitly sum previously calculated exponentials with an adder tree
Op_add<typename CONFIG_T::exp_table_t> op_add;
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);

// Multiply the previously calculated exponentials by the reciprocal of the sum
hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t,CONFIG_T>(exp_sum)];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
res[i] = exp_res[i] * inv_exp_sum;
}
}
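// ------------------------------------------------------------------------
// For comparison only (a sketch, not part of the generated firmware): the
// computation above is the standard numerically stable softmax. In plain
// floating point, with std::exp and a division standing in for the
// exp_table / invert_table look-ups, it would read:
//
//   #include <cmath>
//   #include <cstddef>
//
//   template <std::size_t N>
//   void softmax_stable_reference(const double (&x)[N], double (&y)[N]) {
//       double x_max = x[0];
//       for (std::size_t i = 1; i < N; i++)
//           if (x[i] > x_max) x_max = x[i];
//       double sum = 0.0;
//       for (std::size_t i = 0; i < N; i++) {
//           y[i] = std::exp(x[i] - x_max);   // d_xi_xmax -> exp_res
//           sum += y[i];
//       }
//       const double inv_sum = 1.0 / sum;    // the invert_table look-up
//       for (std::size_t i = 0; i < N; i++)
//           y[i] *= inv_sum;
//   }
// ------------------------------------------------------------------------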

// TODO - Improve accuracy
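// Unlike softmax_stable, the inputs are exponentiated directly, without first
// subtracting the maximum; this is the main source of the accuracy loss noted above.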
template <class data_T, class res_T, typename CONFIG_T>
void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
/*
* Note: The latency tables are identical to the stable tables.
* However, the compiler cannot include the same table twice; doing so raises
* an out-of-scope exception in one of the functions.
* Temporary solution - generate the same table twice in quartus_writer.py.
* Long-term solution - only generate the tables needed by the network;
* currently, quartus_writer.py generates LUTs for all activations,
* regardless of whether they are present in the network.
*/
#include "activation_tables/exp_table_latency.tb"
#include "activation_tables/invert_table_latency.tb"

// Calculate all the e^x's
hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table_latency[softmax_idx_from_real_val<data_T, CONFIG_T>(data[i])];
}

// Explicitly sum the results with an adder tree.
Op_add<typename CONFIG_T::exp_table_t> op_add;
hls_register typename CONFIG_T::exp_table_t exp_sum = reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);

// Multiply the previously calculated exponentials by the reciprocal of the sum
hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t,CONFIG_T>(exp_sum)];
#pragma unroll
for(unsigned i = 0; i < CONFIG_T::n_in; i++){
res[i] = exp_res[i] * inv_exp_sum;
}
}

template<class data_T, class res_T, typename CONFIG_T>
void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
#include "activation_tables/exp_table_legacy.tb"
#include "activation_tables/invert_table_legacy.tb"

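// The previous Quartus softmax, renamed. It relies on the identity
// softmax(x)_i = 1 / sum_j exp(x_j - x_i): for each output it accumulates table
// look-ups of exp(x_j - x_i) and then looks up the reciprocal of the accumulated sum.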
hls_register int data_round[CONFIG_T::n_in];
New_loop:
#pragma unroll
for (int ii=0; ii<CONFIG_T::n_in; ii++) {
data_round[ii] = (data[ii] * CONFIG_T::table_size/16).to_int();
}
NN_Outer:
#pragma unroll
for (int ii=0; ii<CONFIG_T::n_in; ii++) {
typename CONFIG_T::exp_table_t exp_res_temp = 0;
NN_Inner:
#pragma unroll
for (int jj=0; jj<CONFIG_T::n_in; jj++) {
if (ii==jj) {
exp_res_temp += 1;
}
else {
int _data_cache = (data_round[jj]-data_round[ii]);
int index = _data_cache + 8*CONFIG_T::table_size/16;

if (index < 0) index = 0;
if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;

typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index];
exp_res_temp += temp_exp;
}
}
int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int();
if (exp_res_index < 0) exp_res_index = 0;
if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1;
res[ii] = invert_table_legacy[exp_res_index];
}
}

template<class data_T, class res_T, typename CONFIG_T>
inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
switch(CONFIG_T::implementation) {
case softmax_implementation::stable:
softmax_stable<data_T, res_T, CONFIG_T>(data, res);
break;
case softmax_implementation::latency:
softmax_latency<data_T, res_T, CONFIG_T>(data, res);
break;
case softmax_implementation::legacy:
softmax_legacy<data_T, res_T, CONFIG_T>(data, res);
break;
default:
softmax_stable<data_T, res_T, CONFIG_T>(data, res);
break;
}
}
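// Since CONFIG_T::implementation is a compile-time constant, this switch is expected
// to be resolved during HLS compilation, so only the selected variant is synthesized;
// unrecognised values fall back to the stable implementation.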

// *************************************************
42 changes: 42 additions & 0 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h
@@ -24,10 +24,12 @@
#include "ac_int.h"
#include "ac_fixed.h"
#include "math.h"
#include "nnet_helpers.h"
#else
#include "HLS/ac_int.h"
#include "HLS/ac_fixed.h"
#include "HLS/math.h"
#include "nnet_helpers.h"
#endif

typedef ac_fixed<16,6> table_default_t;
@@ -58,6 +60,46 @@ typedef ac_fixed<32,10> accum_t_def;
}
}

/* ---
* Balanced tree reduce implementation.
* For use in scenarios where Quartus cannot balance expressions itself.
* Reduces an array of inputs to a single value using the template binary operator 'Op',
* for example summing all elements with Op_add, or finding the maximum with Op_max.
* Use only when the input array is fully unrolled; otherwise, slice out a fully unrolled
* section before applying, and accumulate the result over the rolled dimension.
* --- */
template<class T, int N, class Op>
T reduce(const T* x, Op op)
{
static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
if (N == 1){
return x[0];
}
if (N == 2){
return op(x[0],x[1]);
}
return op(reduce<T,leftN,Op>(x, op), reduce<T,rightN,Op>(x+leftN, op));
}
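// Example: for N = 8 the recursion splits as (4, 4) -> ((2, 2), (2, 2)), a balanced
// tree of depth 3; for N = 5 it splits as (4, 1), so the depth is ceil(log2(N))
// rather than the N - 1 steps of a sequential accumulation.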



template<class T>
class Op_add{
public:
T operator()(T a, T b){
return a + b;
}
};

template<class T>
class Op_max{
public:
T operator()(T a, T b){
return a >= b ? a : b;
}
};

}

#endif
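For illustration, a minimal sketch of how the new reduce helper and operator classes might be exercised outside of softmax; the include path, the function name reduce_example, and the choice of int data are assumptions, and the snippet presumes an environment where the Intel HLS headers used by nnet_common.h resolve:

#include "nnet_common.h"  // the header shown above; include paths are assumed

// Hypothetical usage mirroring the pattern in softmax_stable: a balanced-tree
// sum and maximum over a small, fully unrolled array.
int reduce_example(int v[8]) {
    nnet::Op_add<int> op_add;
    nnet::Op_max<int> op_max;
    int sum = nnet::reduce<int, 8, nnet::Op_add<int>>(v, op_add);
    int max = nnet::reduce<int, 8, nnet::Op_max<int>>(v, op_max);
    return sum - max;  // e.g. the sum of all elements except the largest
}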
6 changes: 5 additions & 1 deletion hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h
@@ -26,7 +26,6 @@
#include <fstream>
#include <algorithm>
#include <map>
#include "nnet_common.h"

namespace nnet {

@@ -50,6 +49,11 @@ extern size_t trace_type_size;
constexpr int ceillog2(int x){
return (x <= 2) ? 1 : 1 + ceillog2((x+1) / 2);
}

constexpr int floorlog2(int x){
return (x < 2) ? 0 : 1 + floorlog2(x / 2);
}

constexpr int pow2(int x){
return x == 0 ? 1 : 2 * pow2(x - 1);
}
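// Examples: ceillog2(1024) == 10, floorlog2(1000) == 9, pow2(4) == 16. These
// compile-time helpers size the table address slice in nnet_activation.h and
// the split of the balanced reduce tree in nnet_common.h.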