Remove intermediate casting in product #490

Merged · 11 commits · Mar 23, 2022
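Summary: every `nnet::product` functor previously took the result type as a third template parameter and cast each partial product down before it reached the accumulator; this PR drops that parameter, so the functor returns the natural full-precision product and the single narrowing cast happens at the accumulation site. A minimal sketch of the before/after functor shape, inferred from the call sites in this diff rather than copied from nnet_mult.h (the `mult_before` name is invented for contrast):

```cpp
// Minimal sketch of the functor shape implied by the call sites in this
// diff; the real definitions live in nnet_mult.h and may differ in detail.
namespace nnet { namespace product {

// Before: result type res_T is a template parameter; every partial
// product is cast down before it reaches the accumulator.
template<class x_T, class y_T, class res_T>
struct mult_before {
    static res_T product(x_T a, y_T w) { return static_cast<res_T>(a * w); }
    static void limit(unsigned multiplier_limit) { /* resource-limit hook */ }
};

// After: only the operand types are parameters; the product keeps its
// natural (widest) type and the caller casts once when accumulating.
template<class x_T, class y_T>
struct mult {
    static auto product(x_T a, y_T w) -> decltype(a * w) { return a * w; }
    static void limit(unsigned multiplier_limit) { /* resource-limit hook */ }
};

}} // namespace nnet::product

// Usage as in the updated kernels, with accum_t standing in for
// typename CONFIG_T::accum_t:
//   acc += static_cast<accum_t>(nnet::product::mult<data_t, weight_t>::product(d, w));
```

Deferring the cast avoids quantising each partial product on the way into the accumulator while leaving the resource-limiting `limit()` hook unchanged.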
2 changes: 0 additions & 2 deletions hls4ml/backends/quartus/passes/core_templates.py

@@ -71,8 +71,6 @@ def format(self, node):
     static const bool store_weights_in_bram = false;
     typedef {bias_t.name} bias_t;
     typedef {scale_t.name} scale_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
 }};\n"""

 batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'
4 changes: 2 additions & 2 deletions hls4ml/backends/vivado/passes/convolution_templates.py

@@ -13,8 +13,8 @@
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""

 # Conv1D templates
8 changes: 4 additions & 4 deletions hls4ml/backends/vivado/passes/core_templates.py

@@ -18,8 +18,8 @@
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
     typedef {index_t.name} index_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""

 dense_function_template = 'nnet::dense<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
@@ -62,8 +62,8 @@ def format(self, node):
     static const bool store_weights_in_bram = false;
     typedef {bias_t.name} bias_t;
     typedef {scale_t.name} scale_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""

 batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'
4 changes: 2 additions & 2 deletions hls4ml/backends/vivado/passes/merge_templates.py

@@ -50,8 +50,8 @@ def format(self, node):
     static const unsigned n_out = {n_out};
     static const unsigned reuse_factor = {reuse};
     typedef {accum_t.name} accum_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""

 class DotConfigTemplate(LayerConfigTemplate):
44 changes: 20 additions & 24 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h

@@ -51,38 +51,34 @@ struct dense_config
     // partitioning arrays cyclically to go with roll factors?
 };

-template<class data_T, class weight_T, class ret_T>
-inline typename std::enable_if<std::is_same<data_T, ac_int<1, false>>::value
-        and std::is_same<weight_T, ac_int<1, false>>::value, ac_int<1, false>>::type
-product(ac_int<1, false> a, ac_int<1, false> w){
+inline ac_int<1, false> product(ac_int<1, false> a, ac_int<1, false> w)
+{
     // specialisation for 1-bit weights and incoming data
-    return (ret_T) (a == w);
+    return (a == w);
 }

-template<class data_T, class weight_T, class ret_T>
-inline typename std::enable_if<(not std::is_same<data_T, ac_int<1, false>>::value)
-        and std::is_same<weight_T, ac_int<1, false>>::value, ret_T>::type
-product(data_T a, ac_int<1, false> w){
+template<class data_T>
+auto product(data_T a, ac_int<1, false> w) -> decltype(-a)
+{
     // Specialisation for 1-bit weights, arbitrary data
-    return w == 0 ? (ret_T) -a : a;
+    if (w == 0) return -a;
+    else return a;
 }

-template<class data_T, class weight_T, class ret_T>
-inline typename std::enable_if<(not std::is_same<data_T, ac_int<2, false>>::value)
-        and std::is_same<weight_T, ac_int<2, true>>::value, ret_T>::type
-product(data_T a, ac_int<2, true> w){
+template<class data_T>
+auto product(data_T a, ac_int<2, true> w) -> decltype(-a)
+{
     // Specialisation for 2-bit weights, arbitrary data
-    if (w == 0) return (ret_T) 0;
-    else if(w == -1) return (ret_T) -a;
-    else return (ret_T) a; // if(w == 1)
+    if (w == 0) return 0;
+    else if(w == -1) return -a;
+    else return a; // if(w == 1)
 }

-template<class data_T, class weight_T, class ret_T>
-inline typename std::enable_if<(not std::is_same<data_T, ac_int<1, false>>::value)
-        and (not std::is_same<weight_T, ac_int<1, false>>::value), ret_T>::type
-product(data_T a, weight_T w){
+template<class data_T, class weight_T>
+auto product(data_T a, weight_T w) -> decltype(a*w)
+{
     // 'Normal' product
-    return (ret_T)(a * w);
+    return a * w;
 }

 template<class data_T, class res_T, typename CONFIG_T>
@@ -138,7 +134,7 @@ void dense_rf_gt(
             uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded) * im;
             if (w_index >= CONFIG_T::reuse_factor_rounded*CONFIG_T::block_factor_rounded) continue;
             int data_index = d_index[ir][im];
-            tmp_acc[im] = product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>(data[data_index], weights[w_index]);
+            tmp_acc[im] = product(data[data_index], weights[w_index]);
         }
         hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit];
         ResetMult:
@@ -192,7 +188,7 @@ void dense_rf_lt(
             for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) {
                 uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded) * im;
                 if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in*CONFIG_T::n_out) continue;
-                mult[im] = product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>(data[in_index], weights[w_index]);
+                mult[im] = product(data[in_index], weights[w_index]);
                 in_index += CONFIG_T::reuse_factor;
                 if (in_index >= CONFIG_T::n_in) in_index = ir;
             }
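The Quartus `product` overloads above drop the `std::enable_if` machinery: ordinary overload resolution on the weight type now picks the right specialisation, and a trailing `decltype` return lets each overload report its natural result type. A compilable miniature of that pattern, with `bit1`/`bit2` as invented stand-ins for `ac_int<1, false>` and `ac_int<2, true>` (the real types need Intel's AC datatypes headers):

```cpp
#include <cstdio>

struct bit1 { bool v; };  // stand-in for ac_int<1, false>
struct bit2 { int v;  };  // stand-in for ac_int<2, true>

// 1-bit data and weight: XNOR, as in the first specialisation above
inline bool product(bit1 a, bit1 w) { return a.v == w.v; }

// 1-bit weight, arbitrary data: w selects +a or -a
template<class data_T>
auto product(data_T a, bit1 w) -> decltype(-a) {
    return w.v ? a : -a;
}

// 2-bit weight, arbitrary data: w is one of {-1, 0, 1}
template<class data_T>
auto product(data_T a, bit2 w) -> decltype(-a) {
    if (w.v == 0) return 0;
    return w.v == -1 ? -a : a;
}

// General case: in this sketch decltype(a * w) is ill-formed for bit1/bit2
// weights, so SFINAE removes this overload there; elsewhere it reports the
// full-precision product type.
template<class data_T, class weight_T>
auto product(data_T a, weight_T w) -> decltype(a * w) {
    return a * w;
}

int main() {
    short d = 7;
    printf("%d %d %d\n",
           (int)product(d, bit1{false}),  // -7: sign flip
           (int)product(d, bit2{-1}),     // -7: times -1
           (int)product(d, (short)3));    // 21: plain multiply
    return 0;
}
```

The callers in dense_rf_gt/dense_rf_lt no longer spell out the three template arguments; they just call `product(data, weight)` and assign into a `CONFIG_T::accum_t`, so the cast happens once at the assignment.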
10 changes: 5 additions & 5 deletions hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h

@@ -43,8 +43,8 @@ struct batchnorm_config
     static const bool store_weights_in_bram = false;
     static const unsigned n_zeros = 0;
     // partitioning arrays cyclically to go with roll factors?
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::mult<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::mult<x_T, y_T>;
 };

 template<class data_T, class res_T, typename CONFIG_T>
@@ -71,7 +71,7 @@ void normalize(
         #pragma HLS ARRAY_PARTITION variable=bias complete

         int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor));
-        CONFIG_T::template product<data_T, typename CONFIG_T::scale_t, res_T>::limit(multiplier_limit);
+        CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::limit(multiplier_limit);

     } else if (CONFIG_T::io_type == io_serial) {
         #pragma HLS ARRAY_RESHAPE variable=scale complete dim=1
@@ -87,10 +87,10 @@
         }

         if (CONFIG_T::n_filt==-1) {
-            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t, res_T>::product(data[ires], scale[ires]) + bias[ires];
+            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) + bias[ires];
         } else {
             int norm_index = ires%CONFIG_T::n_filt;
-            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t, res_T>::product(data[ires], scale[norm_index]) + bias[norm_index];
+            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) + bias[norm_index];
         }
     }
 }
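With the two-argument functor, the scale multiply in `normalize()` happens at full precision and is only narrowed when the sum is assigned to `res[ires]`. A self-contained miniature of that inner expression; all type widths below are invented stand-ins, not types from this diff:

```cpp
#include <cstdint>

// stand-in for nnet::product::mult after this PR
template<class x_T, class y_T>
struct mult {
    static auto product(x_T a, y_T w) -> decltype(a * w) { return a * w; }
};

int main() {
    using data_t  = int16_t;  // invented stand-in widths
    using scale_t = int8_t;
    using res_t   = int16_t;

    data_t  data  = 1200;
    scale_t scale = 4;
    res_t   bias  = -100;

    // Mirrors: res[ires] = CONFIG_T::template product<data_T, scale_t>
    //                          ::product(data[ires], scale[ires]) + bias[ires];
    // The multiply runs at full (promoted) precision; the single
    // narrowing happens on assignment to res.
    res_t res = mult<data_t, scale_t>::product(data, scale) + bias;
    return res == 4700 ? 0 : 1;  // 1200*4 - 100
}
```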
4 changes: 2 additions & 2 deletions hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h

@@ -43,7 +43,7 @@ void normalize(

     constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor);
     constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit;
-    CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t, typename res_T::value_type>::limit(multiplier_limit);
+    CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t>::limit(multiplier_limit);

     BatchNormLoop: for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
         #pragma HLS PIPELINE II=ii
@@ -60,7 +60,7 @@
             } else {
                 norm_index = j % CONFIG_T::n_filt;
             }
-            out_data[j] = CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t, typename res_T::value_type>::product(in_data[j], scale[norm_index]) + bias[norm_index];
+            out_data[j] = CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t>::product(in_data[j], scale[norm_index]) + bias[norm_index];
         }

         res.write(out_data);
4 changes: 2 additions & 2 deletions hls4ml/templates/vivado/nnet_utils/nnet_dense.h

@@ -30,8 +30,8 @@ struct dense_config
     static const unsigned n_zeros = 0;
     // partitioning arrays cyclically to go with roll factors?
     // Product function to use
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::mult<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::mult<x_T, y_T>;
 };

 template<class data_T, class res_T, typename CONFIG_T>
2 changes: 1 addition & 1 deletion hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h

@@ -86,7 +86,7 @@ void dense_compressed(
         auto weight_cache = weights[w].weight;
         data_T data_cache = data[row];
         //mult[col] += weight_cache * data_cache;
-        typename CONFIG_T::accum_t prod = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data_cache, weight_cache);
+        typename CONFIG_T::accum_t prod = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data_cache, weight_cache);
         fill_mult<CONFIG_T>(col, mult, prod);
     }

6 changes: 3 additions & 3 deletions hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h

@@ -54,7 +54,7 @@ void dense_latency(
         #pragma HLS ARRAY_PARTITION variable=acc complete

         int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor));
-        CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::limit(multiplier_limit);
+        CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::limit(multiplier_limit);

     } else if (CONFIG_T::io_type == io_serial){
         // Only reduce cycle_factor if n_out is evenly divisible by reuse_factor
@@ -90,10 +90,10 @@ void dense_latency(
         Product2: for(int jj = 0; jj < CONFIG_T::n_out; jj++) {
             if (CONFIG_T::io_type == io_serial) {
                 int multiplier_limit = ceil(float(CONFIG_T::n_out) / float(CONFIG_T::reuse_factor));
-                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::limit(multiplier_limit);
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::limit(multiplier_limit);
            }
            int index = ii*CONFIG_T::n_out+jj;
-           mult[index] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(cache, weights[index]);
+           mult[index] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(cache, weights[index]);
         }
     }

8 changes: 5 additions & 3 deletions hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h

@@ -73,7 +73,8 @@ void dense_resource_rf_leq_nin(
         for (int im = 0; im < block_factor; im++) {
             #pragma HLS UNROLL

-            acc[out_index] += CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+            acc[out_index] += static_cast<typename CONFIG_T::accum_t>(
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]));

             // Increment w_index
             w_index += rufactor;
@@ -157,7 +158,8 @@ void dense_resource_rf_gt_nin_rem0(
         MultLoop:
         for (int im = 0; im < block_factor; im++) {
             #pragma HLS UNROLL
-            acc[out_index] += CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+            acc[out_index] += static_cast<typename CONFIG_T::accum_t>(
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]));

             w_index += rufactor;
             if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) break; // check out of bounds
@@ -223,7 +225,7 @@ void dense_resource_rf_gt_nin(
             int w_index = ir + rufactor * im;
             int in_index = w_index % nin;
             if (w_index >= CONFIG_T::n_in*CONFIG_T::n_out) continue; // check out of bounds
-            tmpmult[im] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+            tmpmult[im] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]);
         }

         typename CONFIG_T::accum_t mult[multiplier_limit];
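dense_resource is the one place this PR adds a cast rather than removing one: `product()` now returns its natural wide type, so the narrowing to `accum_t` is written explicitly at the accumulate instead of being buried inside the functor. A plain-integer sketch of the pattern; `accum_t` and the widths below are assumptions, not types from the diff:

```cpp
#include <cstdint>

using accum_t = int32_t;  // stand-in for typename CONFIG_T::accum_t

// two-argument product as introduced by this PR
template<class data_T, class weight_T>
auto product(data_T a, weight_T w) -> decltype(a * w) { return a * w; }

int main() {
    int16_t data[3]    = {100, -200, 300};
    int8_t  weights[3] = {3, -5, 7};

    accum_t acc = 0;
    for (int i = 0; i < 3; i++) {
        // Full-width product first, then one explicit cast at the
        // accumulate, mirroring:
        //   acc[out_index] += static_cast<typename CONFIG_T::accum_t>(
        //       CONFIG_T::template product<...>::product(...));
        acc += static_cast<accum_t>(product(data[i], weights[i]));
    }
    return acc == 3400 ? 0 : 1;  // 300 + 1000 + 2100
}
```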
8 changes: 4 additions & 4 deletions hls4ml/templates/vivado/nnet_utils/nnet_merge.h

@@ -38,8 +38,8 @@ struct dot_config {
     static const unsigned reuse_factor = 1;
     typedef float accum_t;
     // Product function to use
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::mult<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::mult<x_T, y_T>;
 };

 struct concat_config {
@@ -129,15 +129,15 @@ void dot1d(
     #pragma HLS PIPELINE II=CONFIG_T::reuse_factor

     constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor);
-    CONFIG_T::template product<input1_T, input2_T, typename CONFIG_T::accum_t>::limit(multiplier_limit);
+    CONFIG_T::template product<input1_T, input2_T>::limit(multiplier_limit);

     typename CONFIG_T::accum_t mult[CONFIG_T::n_in];
     #pragma HLS ARRAY_PARTITION variable=mult complete
     typename CONFIG_T::accum_t acc = 0;

     Product: for(int i_mult=0; i_mult < CONFIG_T::n_in; i_mult++) {
         #pragma HLS UNROLL
-        mult[i_mult] = CONFIG_T::template product<input1_T, input2_T, typename CONFIG_T::accum_t>::product(data1[i_mult], data2[i_mult]);
+        mult[i_mult] = CONFIG_T::template product<input1_T, input2_T>::product(data1[i_mult], data2[i_mult]);
     }

     Accum: for(int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) {