Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quartus streaming support for Activations, Dense & Batch Normalization #557

Merged
merged 11 commits into from
Jul 26, 2022
11 changes: 11 additions & 0 deletions hls4ml/backends/fpga/fpga_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,13 @@ def definition_cpp(self, name_suffix='', as_reference=False):
else: # Declaration
return 'hls::stream<{type}> {name}{suffix}("{name}")'.format(type=self.type.name, name=self.name, suffix=name_suffix)

class QuartusStreamVariableDefinition(VariableDefinition):
    """Emits the C++ definition of an Intel HLS (Quartus) stream variable.

    Intel HLS streams cannot be copied (implicitly deleted copy constructor),
    so when the variable appears as a function parameter it is passed by
    reference; otherwise a plain local declaration is emitted.
    """

    def definition_cpp(self, name_suffix='', as_reference=False):
        if as_reference:
            # Function parameter: streams must be passed by reference.
            return f'stream<{self.type.name}> &{self.name}{name_suffix}'
        # Local (inter-layer) declaration.
        return f'stream<{self.type.name}> {self.name}{name_suffix}'

class StreamVariableConverter(object):
def __init__(self, type_converter, prefix, definition_cls):
self.type_converter = type_converter
Expand All @@ -280,6 +287,10 @@ class VivadoStreamVariableConverter(StreamVariableConverter):
def __init__(self, type_converter):
super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition)

class QuartusStreamVariableConverter(StreamVariableConverter):
    """Stream-variable converter for the Quartus backend.

    Thin specialization of StreamVariableConverter that selects the
    Quartus prefix and the Quartus stream definition class.
    """

    def __init__(self, type_converter):
        super().__init__(
            type_converter=type_converter,
            prefix='Quartus',
            definition_cls=QuartusStreamVariableDefinition,
        )

#endregion

#region InplaceVariable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from hls4ml.model.optimizer import OptimizerPass

from hls4ml.model.layers import Layer, register_layer
from hls4ml.backends import get_backend
from hls4ml.backends.template import FunctionCallTemplate

class Clone(Layer):
Expand Down
6 changes: 3 additions & 3 deletions hls4ml/backends/quartus/passes/core_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'

dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h']
dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h']

class DenseConfigTemplate(LayerConfigTemplate):
def __init__(self):
Expand Down Expand Up @@ -80,7 +80,7 @@ def format(self, node):

batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'

batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h']
batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h']

class BatchNormalizationConfigTemplate(LayerConfigTemplate):
def __init__(self):
Expand Down Expand Up @@ -130,7 +130,7 @@ def format(self, node):
activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});'

activ_include_list = ['nnet_utils/nnet_activation.h']
activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h']

class ActivationConfigTemplate(LayerConfigTemplate):
def __init__(self):
Expand Down
7 changes: 3 additions & 4 deletions hls4ml/backends/quartus/passes/transform_types.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@

from hls4ml.model.optimizer import GlobalOptimizerPass
from hls4ml.model.types import InplaceVariable
from hls4ml.backends.fpga.fpga_types import ACTypeConverter, QuartusArrayVariableConverter, HLSTypeConverter, QuartusInplaceVariableConverter, QuartusStructMemberVariableConverter, StaticWeightVariableConverter

from hls4ml.backends.fpga.fpga_types import ACTypeConverter, QuartusArrayVariableConverter, HLSTypeConverter, QuartusInplaceVariableConverter, QuartusStreamVariableConverter, QuartusStructMemberVariableConverter, StaticWeightVariableConverter

class TransformTypes(GlobalOptimizerPass):
def __init__(self):
    """Set up the converters that translate model variables and precision
    types into their Quartus (Intel HLS) C++ equivalents."""
    # All variable converters share a single HLS type converter built on
    # the ac_fixed/ac_int precision converter.
    type_converter = HLSTypeConverter(precision_converter=ACTypeConverter())
    self.type_converter = type_converter
    self.inplace_var_converter = QuartusInplaceVariableConverter(type_converter=type_converter)
    self.stream_var_converter = QuartusStreamVariableConverter(type_converter=type_converter)
    self.array_var_converter = QuartusArrayVariableConverter(type_converter=type_converter)
    self.struct_var_converter = QuartusStructMemberVariableConverter(type_converter=type_converter)
    self.weight_var_converter = StaticWeightVariableConverter(type_converter=type_converter)

Expand All @@ -18,9 +18,8 @@ def transform(self, model, node):
for out_name, var in node.variables.items():
if isinstance(var, InplaceVariable):
new_var = self.inplace_var_converter.convert(var, io_type)

if io_type == 'io_stream':
raise Exception('Streaming IO is not supported in Quartus.')
new_var = self.stream_var_converter.convert(var)
elif io_type == 'io_parallel':
if node.name in node.model.inputs:
new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='inputs')
Expand Down
22 changes: 10 additions & 12 deletions hls4ml/backends/quartus/quartus_backend.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
import numpy as np
import math
import os
import copy
import webbrowser
from calmjs.parse import es5
from calmjs.parse import asttypes
from tabulate import tabulate
from ast import literal_eval
from contextlib import contextmanager

from hls4ml.model.types import NamedType, IntegerPrecisionType, FixedPrecisionType
from hls4ml.model.layers import Embedding, Layer, Dense, BatchNormalization, Activation, ParametrizedActivation, PReLU, Softmax
from hls4ml.model.optimizer import get_backend_passes, layer_optimizer, model_optimizer
from hls4ml.model.layers import Layer, Dense, Activation, Softmax, Embedding
from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
from hls4ml.model.flow import register_flow
from hls4ml.backends import FPGABackend
from hls4ml.report import parse_quartus_report
Expand All @@ -34,6 +26,11 @@ def _register_flows(self):
initializers = self._get_layer_initializers()
init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name)

streaming_passes = [
'quartus:clone_output'
]
streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name)

quartus_types = [
'quartus:transform_types',
]
Expand All @@ -46,7 +43,6 @@ def _register_flows(self):
]
quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name)


templates = self._get_layer_templates()
template_flow = register_flow('apply_templates', templates, requires=[init_flow], backend=self.name)

Expand All @@ -69,7 +65,7 @@ def _register_flows(self):
else:
extras_flow = None

ip_flow_requirements = ['optimize', init_flow, quantization_flow, quartus_types_flow, extras_flow, template_flow]
ip_flow_requirements = ['optimize', init_flow, streaming_flow, quantization_flow, quartus_types_flow, extras_flow, template_flow]
ip_flow_requirements = list(filter(None, ip_flow_requirements))

self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name)
Expand Down Expand Up @@ -178,6 +174,8 @@ def init_dense(self, layer):

@layer_optimizer(Activation)
def init_activation(self, layer):
if layer.get_attr('activation') == 'tanh':
layer.set_attr('activation', 'dense_tanh')
if 'table_t' not in layer.attributes:
layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=18, integer=8)))
if 'table_size' not in layer.attributes:
Expand Down
36 changes: 36 additions & 0 deletions hls4ml/templates/quartus/ac_types/stream.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#ifndef NNET_STREAM_H
#define NNET_STREAM_H

#include <cstddef>
#include <deque>

namespace nnet {

/*
 * A struct with the same high-level functionality as Intel's HLS ihc::stream
 * This struct is used during GCC compilation / hls4ml model.predict(...)
 * This is because GCC does not have access to HLS source files (ihc::stream)
 * Software-wise, this struct behaves like a first-in, first-out (FIFO) buffer
 * However, it cannot be used for HLS synthesis, since it uses dynamic memory allocation (deque)
 */
template<typename T>
struct stream {
  private:
    std::deque<T> _data;

  public:
    stream() {}

    // Removes and returns the oldest element (FIFO order).
    // Precondition: the stream must be non-empty; calling read() on an
    // empty stream is undefined behaviour (std::deque::front on empty).
    T read() {
        T element = _data.front();
        _data.pop_front();
        return element;
    }

    // Appends an element at the back of the FIFO.
    void write(const T& element) {
        _data.push_back(element);
    }

    // True when no elements are buffered; read() must not be called then.
    bool empty() const {
        return _data.empty();
    }

    // Number of elements currently buffered.
    std::size_t size() const {
        return _data.size();
    }
};

}

#endif
31 changes: 31 additions & 0 deletions hls4ml/templates/quartus/firmware/defines.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,47 @@
#ifndef DEFINES_H_
#define DEFINES_H_

/*
* Intel HLS makes use of three streaming interfaces:
* (1) stream_in - used as the main input to a component
* (2) stream_out - used as the main output of a component
* (3) stream - allows both reading and writing; used for inter-component connections
 * ihc::stream has an implicitly deleted copy constructor and therefore cannot be used as the output of a function/component
* Therefore, variables of type 'stream' are always passed by reference
*/

#ifndef __INTELFPGA_COMPILER__

#include "ac_int.h"
#include "ac_fixed.h"
#define hls_register

#include "stream.h"
template<typename T>
using stream = nnet::stream<T>;
template<typename T>
using stream_in = nnet::stream<T>;
template<typename T>
using stream_out = nnet::stream<T>;

#else

#include "HLS/hls.h"
#include "HLS/ac_int.h"
#include "HLS/ac_fixed.h"

template<typename T>
using stream = ihc::stream<T>;
template<typename T>
using stream_in = ihc::stream_in<T>;
template<typename T>
using stream_out = ihc::stream_out<T>;

#endif

// Include nnet::array - a custom array-like struct, mainly used with io_stream
#include "nnet_utils/nnet_types.h"

//hls-fpga-machine-learning insert numbers


Expand Down
53 changes: 38 additions & 15 deletions hls4ml/templates/quartus/firmware/myproject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,51 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
#include <iostream>

#include "myproject.h"

//hls-fpga-machine-learning insert weights

/*
* Intel HLS requires that all 'stream' types are:
* (1) Passed by reference to the top-level entity or
* (2) Declared as global variables, outside of the main function
 * Therefore, layer inputs/outputs (connections between individual layers) are declared here
*/
//hls-fpga-machine-learning insert inter-task streams

#ifndef __INTELFPGA_COMPILER__
output_data myproject(
input_data inputs
) {
/*
 * The top-level function used during GCC compilation / hls4ml.predict(...) goes here
* An important distinction is made between io_stream and io_parallel:
* (1) io_parallel:
- Top-level function takes a struct containing an array as function argument
- Returns a struct containing an array - the prediction
(2) io_stream:
- Top-level function is 'void' - no return value
- Instead, both the input and output are passed by reference
 - This is due to the HLS streaming interfaces; a stream cannot be copied (implicitly deleted copy constructor)
* This distinction is handled in quartus_writer.py
*/
//hls-fpga-machine-learning instantiate GCC top-level
#else
// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
//hls-fpga-machine-learning insert cpragmas
component output_data myproject(
input_data inputs
) {
#endif
hls_register output_data outputs;

// ****************************************
// NETWORK INSTANTIATION
// ****************************************
/*
* The top-level function used during HLS Synthesis goes here
* In a similar manner to GCC, there is a distinction between io_stream & io_parallel
*/
//hls-fpga-machine-learning instantiate HLS top-level
#endif
// If using io_parallel, the output needs to be initialised and returned at the end of this function
// If using io_stream, no output is initialised, as it is passed by reference to the top-level function
//hls-fpga-machine-learning initialize input/output

// ****************************************
// NETWORK INSTANTIATION
// ****************************************

//hls-fpga-machine-learning insert layers
return outputs;
}
//hls-fpga-machine-learning insert layers

//hls-fpga-machine-learning return
40 changes: 26 additions & 14 deletions hls4ml/templates/quartus/firmware/myproject.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,26 +30,38 @@
#include "HLS/ac_fixed.h"
#endif

// Streams are explicitly defined in defines.h, which are included for parameters.h
// Defining them again in this file will cause compile-time errors
#include "parameters.h"

struct input_data {
//hls-fpga-machine-learning insert inputs
};

struct output_data {
//hls-fpga-machine-learning insert outputs
};

// If using io_parallel, inputs and output need to be initialised before calling the top-level function
// If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function
//hls-fpga-machine-learning insert inputs
//hls-fpga-machine-learning insert outputs

#ifndef __INTELFPGA_COMPILER__
output_data myproject(
input_data inputs
);
/*
 * The top-level function used during GCC compilation / hls4ml.predict(...) goes here
* An important distinction is made between io_stream and io_parallel:
* (1) io_parallel:
- Top-level function takes a struct containing an array as function argument
- Returns a struct containing an array - the prediction
(2) io_stream:
- Top-level function is 'void' - no return value
- Instead, both the input and output are passed by reference
 - This is due to the HLS streaming interfaces; a stream cannot be copied (implicitly deleted copy constructor)
* This distinction is handled in quartus_writer.py
*/
//hls-fpga-machine-learning instantiate GCC top-level
#else
// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
//hls-fpga-machine-learning insert cpragmas
component output_data myproject(
input_data inputs
);

/*
* The top-level function used during HLS Synthesis goes here
* In a similar manner to GCC, there is a distinction between io_stream & io_parallel
*/
//hls-fpga-machine-learning instantiate HLS top-level
#endif

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,7 @@ inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
// TanH Activation
// *************************************************
template<class data_T, class res_T, typename CONFIG_T>
void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
{
void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
static const int MAX_VALUE=4;
// Initialize the lookup table
#include "activation_tables/tanh_table.tb"
Expand Down
Loading