Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Quartus streaming support for Activations, Dense & Batch Normalization #557

Merged
merged 11 commits into from
Jul 26, 2022
11 changes: 11 additions & 0 deletions hls4ml/backends/fpga/fpga_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,13 @@ def definition_cpp(self, name_suffix='', as_reference=False):
else: # Declaration
return 'hls::stream<{type}> {name}{suffix}("{name}")'.format(type=self.type.name, name=self.name, suffix=name_suffix)

class QuartusStreamVariableDefinition(VariableDefinition):
    """Emits the C++ definition of an Intel HLS (Quartus) stream variable.

    Intel HLS streams cannot be copied (implicitly deleted copy constructor),
    so when the variable appears as a function parameter it is passed by
    reference; otherwise a plain local declaration is emitted.
    """

    def definition_cpp(self, name_suffix='', as_reference=False):
        if as_reference:
            # Function parameter: streams must be passed by reference.
            return f'stream<{self.type.name}> &{self.name}{name_suffix}'
        # Local (inter-layer) declaration.
        return f'stream<{self.type.name}> {self.name}{name_suffix}'

class StreamVariableConverter(object):
def __init__(self, type_converter, prefix, definition_cls):
self.type_converter = type_converter
Expand All @@ -280,6 +287,10 @@ class VivadoStreamVariableConverter(StreamVariableConverter):
def __init__(self, type_converter):
super().__init__(type_converter=type_converter, prefix='Vivado', definition_cls=VivadoStreamVariableDefinition)

class QuartusStreamVariableConverter(StreamVariableConverter):
    """Stream-variable converter for the Quartus backend.

    Thin specialization of StreamVariableConverter that selects the
    Quartus prefix and the Quartus stream definition class.
    """

    def __init__(self, type_converter):
        super().__init__(
            type_converter=type_converter,
            prefix='Quartus',
            definition_cls=QuartusStreamVariableDefinition,
        )

#endregion

#region InplaceVariable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from hls4ml.model.optimizer import OptimizerPass

from hls4ml.model.layers import Layer, register_layer
from hls4ml.backends import get_backend
from hls4ml.backends.template import FunctionCallTemplate

class Clone(Layer):
Expand Down
6 changes: 3 additions & 3 deletions hls4ml/backends/quartus/passes/core_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@

dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'

dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h']
dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_compressed.h', 'nnet_utils/nnet_dense_stream.h']

class DenseConfigTemplate(LayerConfigTemplate):
def __init__(self):
Expand Down Expand Up @@ -80,7 +80,7 @@ def format(self, node):

batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'

batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h']
batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h']

class BatchNormalizationConfigTemplate(LayerConfigTemplate):
def __init__(self):
Expand Down Expand Up @@ -130,7 +130,7 @@ def format(self, node):
activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});'

activ_include_list = ['nnet_utils/nnet_activation.h']
activ_include_list = ['nnet_utils/nnet_activation.h', 'nnet_utils/nnet_activation_stream.h']

class ActivationConfigTemplate(LayerConfigTemplate):
def __init__(self):
Expand Down
7 changes: 3 additions & 4 deletions hls4ml/backends/quartus/passes/transform_types.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@

from hls4ml.model.optimizer import GlobalOptimizerPass
from hls4ml.model.types import InplaceVariable
from hls4ml.backends.fpga.fpga_types import ACTypeConverter, QuartusArrayVariableConverter, HLSTypeConverter, QuartusInplaceVariableConverter, QuartusStructMemberVariableConverter, StaticWeightVariableConverter

from hls4ml.backends.fpga.fpga_types import ACTypeConverter, QuartusArrayVariableConverter, HLSTypeConverter, QuartusInplaceVariableConverter, QuartusStreamVariableConverter, QuartusStructMemberVariableConverter, StaticWeightVariableConverter

class TransformTypes(GlobalOptimizerPass):
def __init__(self):
    """Set up the converters that translate model variables and precision
    types into their Quartus (Intel HLS) C++ equivalents."""
    # All variable converters share a single HLS type converter built on
    # the ac_fixed/ac_int precision converter.
    type_converter = HLSTypeConverter(precision_converter=ACTypeConverter())
    self.type_converter = type_converter
    self.inplace_var_converter = QuartusInplaceVariableConverter(type_converter=type_converter)
    self.stream_var_converter = QuartusStreamVariableConverter(type_converter=type_converter)
    self.array_var_converter = QuartusArrayVariableConverter(type_converter=type_converter)
    self.struct_var_converter = QuartusStructMemberVariableConverter(type_converter=type_converter)
    self.weight_var_converter = StaticWeightVariableConverter(type_converter=type_converter)

Expand All @@ -18,9 +18,8 @@ def transform(self, model, node):
for out_name, var in node.variables.items():
if isinstance(var, InplaceVariable):
new_var = self.inplace_var_converter.convert(var, io_type)

if io_type == 'io_stream':
raise Exception('Streaming IO is not supported in Quartus.')
new_var = self.stream_var_converter.convert(var)
elif io_type == 'io_parallel':
if node.name in node.model.inputs:
new_var = self.struct_var_converter.convert(var, pragma='hls_register', struct_name='inputs')
Expand Down
22 changes: 10 additions & 12 deletions hls4ml/backends/quartus/quartus_backend.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
import numpy as np
import math
import os
import copy
import webbrowser
from calmjs.parse import es5
from calmjs.parse import asttypes
from tabulate import tabulate
from ast import literal_eval
from contextlib import contextmanager

from hls4ml.model.types import NamedType, IntegerPrecisionType, FixedPrecisionType
from hls4ml.model.layers import Embedding, Layer, Dense, BatchNormalization, Activation, ParametrizedActivation, PReLU, Softmax
from hls4ml.model.optimizer import get_backend_passes, layer_optimizer, model_optimizer
from hls4ml.model.layers import Layer, Dense, Activation, Softmax, Embedding
from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
from hls4ml.model.flow import register_flow
from hls4ml.backends import FPGABackend
from hls4ml.report import parse_quartus_report
Expand All @@ -34,6 +26,11 @@ def _register_flows(self):
initializers = self._get_layer_initializers()
init_flow = register_flow('init_layers', initializers, requires=['optimize'], backend=self.name)

streaming_passes = [
'quartus:clone_output'
]
streaming_flow = register_flow('streaming', streaming_passes, requires=[init_flow], backend=self.name)

quartus_types = [
'quartus:transform_types',
]
Expand All @@ -46,7 +43,6 @@ def _register_flows(self):
]
quantization_flow = register_flow('quantization', quantization_passes, requires=[init_flow], backend=self.name)


templates = self._get_layer_templates()
template_flow = register_flow('apply_templates', templates, requires=[init_flow], backend=self.name)

Expand All @@ -69,7 +65,7 @@ def _register_flows(self):
else:
extras_flow = None

ip_flow_requirements = ['optimize', init_flow, quantization_flow, quartus_types_flow, extras_flow, template_flow]
ip_flow_requirements = ['optimize', init_flow, streaming_flow, quantization_flow, quartus_types_flow, extras_flow, template_flow]
ip_flow_requirements = list(filter(None, ip_flow_requirements))

self._default_flow = register_flow('ip', None, requires=ip_flow_requirements, backend=self.name)
Expand Down Expand Up @@ -178,6 +174,8 @@ def init_dense(self, layer):

@layer_optimizer(Activation)
def init_activation(self, layer):
if layer.get_attr('activation') == 'tanh':
layer.set_attr('activation', 'dense_tanh')
if 'table_t' not in layer.attributes:
layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=18, integer=8)))
if 'table_size' not in layer.attributes:
Expand Down
36 changes: 36 additions & 0 deletions hls4ml/templates/quartus/ac_types/stream.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#ifndef NNET_STREAM_H
#define NNET_STREAM_H

#include <cstddef>
#include <deque>

namespace nnet {

/*
 * A struct with the same high-level functionality as Intel's HLS ihc::stream
 * This struct is used during GCC compilation / hls4ml model.predict(...)
 * This is because GCC does not have access to HLS source files (ihc::stream)
 * Software-wise, this struct behaves like a first-in, first-out (FIFO) buffer
 * However, it cannot be used for HLS synthesis, since it uses dynamic memory allocation (deque)
 */
template<typename T>
struct stream {
  private:
    std::deque<T> _data;

  public:
    stream() {}

    // Removes and returns the oldest element (FIFO order).
    // Precondition: the stream must be non-empty; calling read() on an
    // empty stream is undefined behaviour (std::deque::front on empty).
    T read() {
        T element = _data.front();
        _data.pop_front();
        return element;
    }

    // Appends an element at the back of the FIFO.
    void write(const T& element) {
        _data.push_back(element);
    }

    // True when no elements are buffered; read() must not be called then.
    bool empty() const {
        return _data.empty();
    }

    // Number of elements currently buffered.
    std::size_t size() const {
        return _data.size();
    }
};

}

#endif
31 changes: 31 additions & 0 deletions hls4ml/templates/quartus/firmware/defines.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,47 @@
#ifndef DEFINES_H_
#define DEFINES_H_

/*
* Intel HLS makes use of three streaming interfaces:
* (1) stream_in - used as the main input to a component
* (2) stream_out - used as the main output of a component
* (3) stream - allows both reading and writing; used for inter-component connections
 * ihc::stream has an implicitly deleted copy constructor and therefore cannot be used as the output of a function/component
* Therefore, variables of type 'stream' are always passed by reference
*/

#ifndef __INTELFPGA_COMPILER__

#include "ac_int.h"
#include "ac_fixed.h"
#define hls_register

#include "stream.h"
template<typename T>
using stream = nnet::stream<T>;
template<typename T>
using stream_in = nnet::stream<T>;
template<typename T>
using stream_out = nnet::stream<T>;

#else

#include "HLS/hls.h"
#include "HLS/ac_int.h"
#include "HLS/ac_fixed.h"

template<typename T>
using stream = ihc::stream<T>;
template<typename T>
using stream_in = ihc::stream_in<T>;
template<typename T>
using stream_out = ihc::stream_out<T>;

#endif

// Include nnet::array - a custom array-like struct, mainly used with io_stream
#include "nnet_utils/nnet_types.h"

//hls-fpga-machine-learning insert numbers


Expand Down
53 changes: 38 additions & 15 deletions hls4ml/templates/quartus/firmware/myproject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,51 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
#include <iostream>

#include "myproject.h"

//hls-fpga-machine-learning insert weights

/*
* Intel HLS requires that all 'stream' types are:
* (1) Passed by reference to the top-level entity or
* (2) Declared as global variables, outside of the main function
 * Therefore, layer inputs/outputs (connections between individual layers) are declared here
*/
//hls-fpga-machine-learning insert inter-task streams

#ifndef __INTELFPGA_COMPILER__
output_data myproject(
input_data inputs
) {
/*
 * The top-level function used during GCC compilation / hls4ml.predict(...) goes here
* An important distinction is made between io_stream and io_parallel:
* (1) io_parallel:
- Top-level function takes a struct containing an array as function argument
- Returns a struct containing an array - the prediction
(2) io_stream:
- Top-level function is 'void' - no return value
- Instead, both the input and output are passed by reference
 - This is due to the HLS streaming interfaces; a stream cannot be copied (implicitly deleted copy constructor)
* This distinction is handled in quartus_writer.py
*/
//hls-fpga-machine-learning instantiate GCC top-level
#else
// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
//hls-fpga-machine-learning insert cpragmas
component output_data myproject(
input_data inputs
) {
#endif
hls_register output_data outputs;

// ****************************************
// NETWORK INSTANTIATION
// ****************************************
/*
* The top-level function used during HLS Synthesis goes here
* In a similar manner to GCC, there is a distinction between io_stream & io_parallel
*/
//hls-fpga-machine-learning instantiate HLS top-level
#endif
// If using io_parallel, the output needs to be initialised and returned at the end of this function
// If using io_stream, no output is initialised, as it is passed by reference to the top-level function
//hls-fpga-machine-learning initialize input/output

// ****************************************
// NETWORK INSTANTIATION
// ****************************************

//hls-fpga-machine-learning insert layers
return outputs;
}
//hls-fpga-machine-learning insert layers

//hls-fpga-machine-learning return
40 changes: 26 additions & 14 deletions hls4ml/templates/quartus/firmware/myproject.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,26 +30,38 @@
#include "HLS/ac_fixed.h"
#endif

// Streams are explicitly defined in defines.h, which are included for parameters.h
// Defining them again in this file will cause compile-time errors
#include "parameters.h"

struct input_data {
//hls-fpga-machine-learning insert inputs
};

struct output_data {
//hls-fpga-machine-learning insert outputs
};

// If using io_parallel, inputs and output need to be initialised before calling the top-level function
// If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function
//hls-fpga-machine-learning insert inputs
//hls-fpga-machine-learning insert outputs

#ifndef __INTELFPGA_COMPILER__
output_data myproject(
input_data inputs
);
/*
 * The top-level function used during GCC compilation / hls4ml.predict(...) goes here
* An important distinction is made between io_stream and io_parallel:
* (1) io_parallel:
- Top-level function takes a struct containing an array as function argument
- Returns a struct containing an array - the prediction
(2) io_stream:
- Top-level function is 'void' - no return value
- Instead, both the input and output are passed by reference
 - This is due to the HLS streaming interfaces; a stream cannot be copied (implicitly deleted copy constructor)
* This distinction is handled in quartus_writer.py
*/
//hls-fpga-machine-learning instantiate GCC top-level
#else
// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
//hls-fpga-machine-learning insert cpragmas
component output_data myproject(
input_data inputs
);

/*
* The top-level function used during HLS Synthesis goes here
* In a similar manner to GCC, there is a distinction between io_stream & io_parallel
*/
//hls-fpga-machine-learning instantiate HLS top-level
#endif

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,7 @@ inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
// TanH Activation
// *************************************************
template<class data_T, class res_T, typename CONFIG_T>
void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
{
void dense_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
static const int MAX_VALUE=4;
// Initialize the lookup table
#include "activation_tables/tanh_table.tb"
Expand Down
Loading