Remove intermediate casting in product #490

Merged · 11 commits · Mar 23, 2022
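Summary: every `nnet::product` functor previously took the result type as a third template parameter and cast each partial product down before it reached the accumulator; this PR drops that parameter, so the functor returns the natural full-precision product and the single narrowing cast happens at the accumulation site. A minimal sketch of the before/after functor shape, inferred from the call sites in this diff rather than copied from nnet_mult.h (the `mult_before` name is invented for contrast):

```cpp
// Minimal sketch of the functor shape implied by the call sites in this
// diff; the real definitions live in nnet_mult.h and may differ in detail.
namespace nnet { namespace product {

// Before: result type res_T is a template parameter; every partial
// product is cast down before it reaches the accumulator.
template<class x_T, class y_T, class res_T>
struct mult_before {
    static res_T product(x_T a, y_T w) { return static_cast<res_T>(a * w); }
    static void limit(unsigned multiplier_limit) { /* resource-limit hook */ }
};

// After: only the operand types are parameters; the product keeps its
// natural (widest) type and the caller casts once when accumulating.
template<class x_T, class y_T>
struct mult {
    static auto product(x_T a, y_T w) -> decltype(a * w) { return a * w; }
    static void limit(unsigned multiplier_limit) { /* resource-limit hook */ }
};

}} // namespace nnet::product

// Usage as in the updated kernels, with accum_t standing in for
// typename CONFIG_T::accum_t:
//   acc += static_cast<accum_t>(nnet::product::mult<data_t, weight_t>::product(d, w));
```

Deferring the cast avoids quantising each partial product on the way into the accumulator while leaving the resource-limiting `limit()` hook unchanged.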
2 changes: 0 additions & 2 deletions hls4ml/backends/quartus/passes/core_templates.py

@@ -71,8 +71,6 @@ def format(self, node):
     static const bool store_weights_in_bram = false;
     typedef {bias_t.name} bias_t;
     typedef {scale_t.name} scale_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
 }};\n"""

 batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'
4 changes: 2 additions & 2 deletions hls4ml/backends/vivado/passes/convolution_templates.py

@@ -13,8 +13,8 @@
     typedef {accum_t.name} accum_t;
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""

 # Conv1D templates
8 changes: 4 additions & 4 deletions hls4ml/backends/vivado/passes/core_templates.py

@@ -18,8 +18,8 @@
     typedef {bias_t.name} bias_t;
     typedef {weight_t.name} weight_t;
     typedef {index_t.name} index_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""

 dense_function_template = 'nnet::dense<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
@@ -62,8 +62,8 @@ def format(self, node):
     static const bool store_weights_in_bram = false;
     typedef {bias_t.name} bias_t;
     typedef {scale_t.name} scale_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""

 batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'
4 changes: 2 additions & 2 deletions hls4ml/backends/vivado/passes/merge_templates.py

@@ -50,8 +50,8 @@ def format(self, node):
     static const unsigned n_out = {n_out};
     static const unsigned reuse_factor = {reuse};
     typedef {accum_t.name} accum_t;
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::{product_type}<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""

 class DotConfigTemplate(LayerConfigTemplate):
44 changes: 20 additions & 24 deletions hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h

@@ -51,38 +51,34 @@ struct dense_config
     // partitioning arrays cyclically to go with roll factors?
 };

-template<class data_T, class weight_T, class ret_T>
-inline typename std::enable_if<std::is_same<data_T, ac_int<1, false>>::value
-        and std::is_same<weight_T, ac_int<1, false>>::value, ac_int<1, false>>::type
-product(ac_int<1, false> a, ac_int<1, false> w){
+inline ac_int<1, false> product(ac_int<1, false> a, ac_int<1, false> w)
+{
     // specialisation for 1-bit weights and incoming data
-    return (ret_T) (a == w);
+    return (a == w);
 }

-template<class data_T, class weight_T, class ret_T>
-inline typename std::enable_if<(not std::is_same<data_T, ac_int<1, false>>::value)
-        and std::is_same<weight_T, ac_int<1, false>>::value, ret_T>::type
-product(data_T a, ac_int<1, false> w){
+template<class data_T>
+auto product(data_T a, ac_int<1, false> w) -> decltype(-a)
+{
     // Specialisation for 1-bit weights, arbitrary data
-    return w == 0 ? (ret_T) -a : a;
+    if (w == 0) return -a;
+    else return a;
 }

-template<class data_T, class weight_T, class ret_T>
-inline typename std::enable_if<(not std::is_same<data_T, ac_int<2, false>>::value)
-        and std::is_same<weight_T, ac_int<2, true>>::value, ret_T>::type
-product(data_T a, ac_int<2, true> w){
+template<class data_T>
+auto product(data_T a, ac_int<2, true> w) -> decltype(-a)
+{
     // Specialisation for 2-bit weights, arbitrary data
-    if (w == 0) return (ret_T) 0;
-    else if(w == -1) return (ret_T) -a;
-    else return (ret_T) a; // if(w == 1)
+    if (w == 0) return 0;
+    else if(w == -1) return -a;
+    else return a; // if(w == 1)
 }

-template<class data_T, class weight_T, class ret_T>
-inline typename std::enable_if<(not std::is_same<data_T, ac_int<1, false>>::value)
-        and (not std::is_same<weight_T, ac_int<1, false>>::value), ret_T>::type
-product(data_T a, weight_T w){
+template<class data_T, class weight_T>
+auto product(data_T a, weight_T w) -> decltype(a*w)
+{
     // 'Normal' product
-    return (ret_T)(a * w);
+    return a * w;
 }

 template<class data_T, class res_T, typename CONFIG_T>
@@ -138,7 +134,7 @@ void dense_rf_gt(
             uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded) * im;
             if (w_index >= CONFIG_T::reuse_factor_rounded*CONFIG_T::block_factor_rounded) continue;
             int data_index = d_index[ir][im];
-            tmp_acc[im] = product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>(data[data_index], weights[w_index]);
+            tmp_acc[im] = product(data[data_index], weights[w_index]);
         }
         hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit];
         ResetMult:
@@ -192,7 +188,7 @@ void dense_rf_lt(
             for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) {
                 uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded) * im;
                 if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in*CONFIG_T::n_out) continue;
-                mult[im] = product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>(data[in_index], weights[w_index]);
+                mult[im] = product(data[in_index], weights[w_index]);
                 in_index += CONFIG_T::reuse_factor;
                 if (in_index >= CONFIG_T::n_in) in_index = ir;
             }
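The Quartus `product` overloads above drop the `std::enable_if` machinery: ordinary overload resolution on the weight type now picks the right specialisation, and a trailing `decltype` return lets each overload report its natural result type. A compilable miniature of that pattern, with `bit1`/`bit2` as invented stand-ins for `ac_int<1, false>` and `ac_int<2, true>` (the real types need Intel's AC datatypes headers):

```cpp
#include <cstdio>

struct bit1 { bool v; };  // stand-in for ac_int<1, false>
struct bit2 { int v;  };  // stand-in for ac_int<2, true>

// 1-bit data and weight: XNOR, as in the first specialisation above
inline bool product(bit1 a, bit1 w) { return a.v == w.v; }

// 1-bit weight, arbitrary data: w selects +a or -a
template<class data_T>
auto product(data_T a, bit1 w) -> decltype(-a) {
    return w.v ? a : -a;
}

// 2-bit weight, arbitrary data: w is one of {-1, 0, 1}
template<class data_T>
auto product(data_T a, bit2 w) -> decltype(-a) {
    if (w.v == 0) return 0;
    return w.v == -1 ? -a : a;
}

// General case: in this sketch decltype(a * w) is ill-formed for bit1/bit2
// weights, so SFINAE removes this overload there; elsewhere it reports the
// full-precision product type.
template<class data_T, class weight_T>
auto product(data_T a, weight_T w) -> decltype(a * w) {
    return a * w;
}

int main() {
    short d = 7;
    printf("%d %d %d\n",
           (int)product(d, bit1{false}),  // -7: sign flip
           (int)product(d, bit2{-1}),     // -7: times -1
           (int)product(d, (short)3));    // 21: plain multiply
    return 0;
}
```

The callers in dense_rf_gt/dense_rf_lt no longer spell out the three template arguments; they just call `product(data, weight)` and assign into a `CONFIG_T::accum_t`, so the cast happens once at the assignment.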
10 changes: 5 additions & 5 deletions hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h

@@ -43,8 +43,8 @@ struct batchnorm_config
     static const bool store_weights_in_bram = false;
     static const unsigned n_zeros = 0;
     // partitioning arrays cyclically to go with roll factors?
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::mult<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::mult<x_T, y_T>;
 };

 template<class data_T, class res_T, typename CONFIG_T>
@@ -71,7 +71,7 @@ void normalize(
         #pragma HLS ARRAY_PARTITION variable=bias complete

         int multiplier_limit = ceil(float(CONFIG_T::n_in) / float(CONFIG_T::reuse_factor));
-        CONFIG_T::template product<data_T, typename CONFIG_T::scale_t, res_T>::limit(multiplier_limit);
+        CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::limit(multiplier_limit);

     } else if (CONFIG_T::io_type == io_serial) {
         #pragma HLS ARRAY_RESHAPE variable=scale complete dim=1
@@ -87,10 +87,10 @@
         }

         if (CONFIG_T::n_filt==-1) {
-            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t, res_T>::product(data[ires], scale[ires]) + bias[ires];
+            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) + bias[ires];
         } else {
             int norm_index = ires%CONFIG_T::n_filt;
-            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t, res_T>::product(data[ires], scale[norm_index]) + bias[norm_index];
+            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) + bias[norm_index];
         }
     }
 }
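With the two-argument functor, the scale multiply in `normalize()` happens at full precision and is only narrowed when the sum is assigned to `res[ires]`. A self-contained miniature of that inner expression; all type widths below are invented stand-ins, not types from this diff:

```cpp
#include <cstdint>

// stand-in for nnet::product::mult after this PR
template<class x_T, class y_T>
struct mult {
    static auto product(x_T a, y_T w) -> decltype(a * w) { return a * w; }
};

int main() {
    using data_t  = int16_t;  // invented stand-in widths
    using scale_t = int8_t;
    using res_t   = int16_t;

    data_t  data  = 1200;
    scale_t scale = 4;
    res_t   bias  = -100;

    // Mirrors: res[ires] = CONFIG_T::template product<data_T, scale_t>
    //                          ::product(data[ires], scale[ires]) + bias[ires];
    // The multiply runs at full (promoted) precision; the single
    // narrowing happens on assignment to res.
    res_t res = mult<data_t, scale_t>::product(data, scale) + bias;
    return res == 4700 ? 0 : 1;  // 1200*4 - 100
}
```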
4 changes: 2 additions & 2 deletions hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h

@@ -43,7 +43,7 @@ void normalize(

     constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor);
     constexpr unsigned ii = CONFIG_T::n_in / multiplier_limit;
-    CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t, typename res_T::value_type>::limit(multiplier_limit);
+    CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t>::limit(multiplier_limit);

     BatchNormLoop: for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
         #pragma HLS PIPELINE II=ii
@@ -60,7 +60,7 @@
             } else {
                 norm_index = j % CONFIG_T::n_filt;
             }
-            out_data[j] = CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t, typename res_T::value_type>::product(in_data[j], scale[norm_index]) + bias[norm_index];
+            out_data[j] = CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t>::product(in_data[j], scale[norm_index]) + bias[norm_index];
         }

         res.write(out_data);
4 changes: 2 additions & 2 deletions hls4ml/templates/vivado/nnet_utils/nnet_dense.h

@@ -30,8 +30,8 @@ struct dense_config
     static const unsigned n_zeros = 0;
     // partitioning arrays cyclically to go with roll factors?
     // Product function to use
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::mult<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::mult<x_T, y_T>;
 };

 template<class data_T, class res_T, typename CONFIG_T>
2 changes: 1 addition & 1 deletion hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h

@@ -86,7 +86,7 @@ void dense_compressed(
         auto weight_cache = weights[w].weight;
         data_T data_cache = data[row];
         //mult[col] += weight_cache * data_cache;
-        typename CONFIG_T::accum_t prod = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data_cache, weight_cache);
+        typename CONFIG_T::accum_t prod = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data_cache, weight_cache);
         fill_mult<CONFIG_T>(col, mult, prod);
     }

6 changes: 3 additions & 3 deletions hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h

@@ -54,7 +54,7 @@ void dense_latency(
         #pragma HLS ARRAY_PARTITION variable=acc complete

         int multiplier_limit = ceil(float(CONFIG_T::n_in*CONFIG_T::n_out) / float(CONFIG_T::reuse_factor)) - floor(float(CONFIG_T::n_zeros) / float(CONFIG_T::reuse_factor));
-        CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::limit(multiplier_limit);
+        CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::limit(multiplier_limit);

     } else if (CONFIG_T::io_type == io_serial){
         // Only reduce cycle_factor if n_out is evenly divisible by reuse_factor
@@ -90,10 +90,10 @@ void dense_latency(
         Product2: for(int jj = 0; jj < CONFIG_T::n_out; jj++) {
             if (CONFIG_T::io_type == io_serial) {
                 int multiplier_limit = ceil(float(CONFIG_T::n_out) / float(CONFIG_T::reuse_factor));
-                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::limit(multiplier_limit);
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::limit(multiplier_limit);
            }
            int index = ii*CONFIG_T::n_out+jj;
-           mult[index] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(cache, weights[index]);
+           mult[index] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(cache, weights[index]);
         }
     }

8 changes: 5 additions & 3 deletions hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h

@@ -73,7 +73,8 @@ void dense_resource_rf_leq_nin(
         for (int im = 0; im < block_factor; im++) {
             #pragma HLS UNROLL

-            acc[out_index] += CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+            acc[out_index] += static_cast<typename CONFIG_T::accum_t>(
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]));

             // Increment w_index
             w_index += rufactor;
@@ -157,7 +158,8 @@ void dense_resource_rf_gt_nin_rem0(
         MultLoop:
         for (int im = 0; im < block_factor; im++) {
             #pragma HLS UNROLL
-            acc[out_index] += CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+            acc[out_index] += static_cast<typename CONFIG_T::accum_t>(
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]));

             w_index += rufactor;
             if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) break; // check out of bounds
@@ -223,7 +225,7 @@ void dense_resource_rf_gt_nin(
             int w_index = ir + rufactor * im;
             int in_index = w_index % nin;
             if (w_index >= CONFIG_T::n_in*CONFIG_T::n_out) continue; // check out of bounds
-            tmpmult[im] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+            tmpmult[im] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]);
         }

         typename CONFIG_T::accum_t mult[multiplier_limit];
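dense_resource is the one place this PR adds a cast rather than removing one: `product()` now returns its natural wide type, so the narrowing to `accum_t` is written explicitly at the accumulate instead of being buried inside the functor. A plain-integer sketch of the pattern; `accum_t` and the widths below are assumptions, not types from the diff:

```cpp
#include <cstdint>

using accum_t = int32_t;  // stand-in for typename CONFIG_T::accum_t

// two-argument product as introduced by this PR
template<class data_T, class weight_T>
auto product(data_T a, weight_T w) -> decltype(a * w) { return a * w; }

int main() {
    int16_t data[3]    = {100, -200, 300};
    int8_t  weights[3] = {3, -5, 7};

    accum_t acc = 0;
    for (int i = 0; i < 3; i++) {
        // Full-width product first, then one explicit cast at the
        // accumulate, mirroring:
        //   acc[out_index] += static_cast<typename CONFIG_T::accum_t>(
        //       CONFIG_T::template product<...>::product(...));
        acc += static_cast<accum_t>(product(data[i], weights[i]));
    }
    return acc == 3400 ? 0 : 1;  // 300 + 1000 + 2100
}
```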
8 changes: 4 additions & 4 deletions hls4ml/templates/vivado/nnet_utils/nnet_merge.h

@@ -38,8 +38,8 @@ struct dot_config {
     static const unsigned reuse_factor = 1;
     typedef float accum_t;
     // Product function to use
-    template<class x_T, class y_T, class res_T>
-    using product = nnet::product::mult<x_T, y_T, res_T>;
+    template<class x_T, class y_T>
+    using product = nnet::product::mult<x_T, y_T>;
 };

 struct concat_config {
@@ -129,15 +129,15 @@ void dot1d(
     #pragma HLS PIPELINE II=CONFIG_T::reuse_factor

     constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor);
-    CONFIG_T::template product<input1_T, input2_T, typename CONFIG_T::accum_t>::limit(multiplier_limit);
+    CONFIG_T::template product<input1_T, input2_T>::limit(multiplier_limit);

     typename CONFIG_T::accum_t mult[CONFIG_T::n_in];
     #pragma HLS ARRAY_PARTITION variable=mult complete
     typename CONFIG_T::accum_t acc = 0;

     Product: for(int i_mult=0; i_mult < CONFIG_T::n_in; i_mult++) {
         #pragma HLS UNROLL
-        mult[i_mult] = CONFIG_T::template product<input1_T, input2_T, typename CONFIG_T::accum_t>::product(data1[i_mult], data2[i_mult]);
+        mult[i_mult] = CONFIG_T::template product<input1_T, input2_T>::product(data1[i_mult], data2[i_mult]);
     }

     Accum: for(int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) {