From 773956ec122a085b5e93456778542b160f51e1a4 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst
Date: Wed, 4 May 2022 16:11:56 +0100
Subject: [PATCH 1/5] Improved Quartus Softmax LUT - Vivado-equivalent approach

---
 hls4ml/utils/fixed_point_utils.py | 126 +++++++++++++++
 hls4ml/writer/quartus_writer.py   | 260 +++++++++++++++++++++---
 2 files changed, 310 insertions(+), 76 deletions(-)
 create mode 100644 hls4ml/utils/fixed_point_utils.py

diff --git a/hls4ml/utils/fixed_point_utils.py b/hls4ml/utils/fixed_point_utils.py
new file mode 100644
index 0000000000..0060cc5360
--- /dev/null
+++ b/hls4ml/utils/fixed_point_utils.py
@@ -0,0 +1,126 @@
+import sys
+import math
+
+'''
+A helper class for emulating fixed point numbers.
+Currently very limited, allowing only:
+    - Conversion to float
+    - Exponents
+    - Reciprocals
+Used primarily for generating the softmax look-up tables
+through bit manipulation (see the equivalent Vivado implementation)
+'''
+class FixedPointEmulator:
+    '''
+        Default constructor
+        Args:
+            - N : Total number of bits in the fixed point number
+            - I : Integer bits in the fixed point number
+            - F = N-I : Fractional bits in the fixed point number
+            - signed : True/False - If True, use 2's complement when converting to float
+            - integer_bits : Optional initial values of the bits in the integer part
+            - decimal_bits : Optional initial values of the bits in the fractional part
+    '''
+    def __init__(self, N, I, signed=True, integer_bits=None, decimal_bits=None):
+        self.N = N
+        self.I = I
+        self.F = N - I
+        self.signed = signed
+        self.integer_bits = [0] * self.I if integer_bits is None else integer_bits
+        self.decimal_bits = [0] * self.F if decimal_bits is None else decimal_bits
+
+    '''
+        Converts the fixed point number stored in self.integer_bits and self.decimal_bits to a floating point number
+        Args:
+            - None
+        Returns:
+            - val : float, the floating point equivalent of the fixed point number
+        Description:
+            1. Initialise the result from the MSB: -2.0^(I-1) if the number is signed and the MSB is set,
+               +2.0^(I-1) if unsigned and the MSB is set, 0.0 otherwise
+            2. Traverse the remaining integer bits, incrementing the result by 2.0^(i-1) (using left shifts)
+            3. Traverse the decimal bits, incrementing the result by 2.0^(-(i+1)) (using pow)
+        Note:
+            - This function uses left shifts instead of integer powers of 2.
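+            - Worked example (illustrative): with N=6, I=3, signed=True, integer_bits=[1, 0, 1] and
+              decimal_bits=[1, 0, 0] (the two's complement pattern 101.100), to_float() returns
+              -4 + 1 + 0.5 = -2.5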
+    '''
+    def to_float(self):
+        val = float(int(self.integer_bits[0]) << (self.I - 1))
+        val = -val if self.signed else val
+
+        for i in range(self.I - 1, 0, -1):
+            val += float(int(self.integer_bits[self.I - i]) << (i - 1))
+
+        for i in range(0, self.F):
+            if (self.decimal_bits[i]):
+                val += pow(2, -(i + 1))
+
+        return val
+
+    '''
+        Sets the top bits of the current number
+        Args:
+            - bits : Values the top bits should be set to
+    '''
+    def set_msb_bits(self, bits):
+        for i in range(0, len(bits)):
+            if i < self.I:
+                self.integer_bits[i] = bits[i]
+            elif i >= self.I and i < self.N:
+                self.decimal_bits[i - self.I] = bits[i]
+
+    '''
+        Returns e^x, where x is the current fixed point number, as a float
+    '''
+    def exp_float(self, sig_figs=12):
+        return round(math.exp(self.to_float()), sig_figs)
+
+    '''
+        Returns 1/x, where x is the current fixed point number, as a float
+        If x is zero, the largest representable float is returned instead
+    '''
+    def inv_float(self, sig_figs=12):
+        if self.to_float() != 0:
+            return round(1.0 / self.to_float(), sig_figs)
+        else:
+            return round(sys.float_info.max, sig_figs)
+
+
+'''
+    Converts an unsigned integer i to its N-bit binary representation, as a list of 0/1 values
+    Args:
+        - i : Unsigned integer to be converted
+        - N : Number of bits in the representation
+    Note:
+        - Requires N > log2(i)+1
+'''
+def uint_to_binary(i, N):
+    # Gets the binary representation of the number
+    bits = [int(b) for b in list('{0:0b}'.format(i))]
+
+    # Zero padding, so exactly N bits are used
+    while (len(bits) < N):
+        bits.insert(0, 0)
+
+    return bits
+
+
+'''
+    Returns log2(i), rounding up
+    Args:
+        - i : Number
+    Returns:
+        - val : representing ceil(log2(i))
+'''
+def ceil_log2(i):
+    return i.bit_length()-1
\ No newline at end of file
diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py
index 9a49422885..99559a6177 100644
--- a/hls4ml/writer/quartus_writer.py
+++ b/hls4ml/writer/quartus_writer.py
@@ -3,24 +3,27 @@
 import yaml
 from shutil import copyfile, copytree, rmtree
 import numpy as np
+import re
 import os
 import glob
 from collections import OrderedDict
 from hls4ml.writer.writers import Writer
+from hls4ml.utils.fixed_point_utils import FixedPointEmulator, ceil_log2, uint_to_binary

 config_filename = 'hls4ml_config.yml'

+
 class QuartusWriter(Writer):
     def next_pow2(self, x):
-        return 1<<(x-1).bit_length()
+        return 1 << (x - 1).bit_length()

     def get_max_reuse_factor(self, model):
         max_rf = 0
         for layer in model.get_layers():
             rf = int(layer.get_attr('reuse_factor'))
-            if(rf > max_rf):
+            if (rf > max_rf):
                 max_rf = rf
         return max_rf

@@ -28,9 +31,9 @@ def print_array_to_cpp(self, var, layer, odir):
         #######################################
         ## Print weight array to C++
         #######################################
-        h_file = open("{}/firmware/weights/{}.h".format(odir,var.name),"w")
+        h_file = open("{}/firmware/weights/{}.h".format(odir, var.name), "w")

-        #meta data
+        # meta data
         h_file.write("//Numpy array shape {}\n".format(var.shape))
         h_file.write("//Min {:.12f}\n".format(np.min(var.min)))
         h_file.write("//Max {:.12f}\n".format(np.max(var.max)))
         h_file.write("//Number of zeros {}\n".format(var.nzeros))
         h_file.write("\n")
         h_file.write("#ifndef {}_H_\n".format(var.name.upper()))
         h_file.write("#define {}_H_\n".format(var.name.upper()))
         h_file.write("\n")

         rf = int(layer.get_attr('reuse_factor'))
         weight_header = '#ifdef __INTELFPGA_COMPILER__\n'
-        if (rf == 1 or var.name[0] == 'b' or layer.get_attr('n_in')*layer.get_attr('n_out') <= 2048
+        if (rf == 1 or var.name[0] == 'b' or layer.get_attr('n_in') * layer.get_attr('n_out') <= 2048
                 or (var.name[0] == 'w' and var.type.precision.width < 3)):
             weight_header += 'hls_init_on_powerup\n'
         else:
-            block_factor = (layer.get_attr('n_in')*layer.get_attr('n_out'))/rf
-            nbanks = int(2**np.ceil(np.log2(block_factor)) / 2)
+            block_factor = (layer.get_attr('n_in') * layer.get_attr('n_out')) / rf
+            nbanks = int(2 ** np.ceil(np.log2(block_factor)) / 2)
             var_width = int(np.ceil(var.type.precision.width / 8))
             bwidth = self.next_pow2(var_width)
-            weight_header += 'hls_bankwidth({bwidth})\nhls_numbanks({nbanks})\nhls_max_replicates(1)\nhls_memory_impl("BLOCK_RAM")\n'.format(bwidth=bwidth, nbanks=nbanks)
+            weight_header += 'hls_bankwidth({bwidth})\nhls_numbanks({nbanks})\nhls_max_replicates(1)\nhls_memory_impl("BLOCK_RAM")\n'.format(
+                bwidth=bwidth, nbanks=nbanks)
         weight_header += '#endif\n'
         weight_header += 'static const '
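+        # Illustrative example of the banking arithmetic above (hypothetical layer, not from the model):
+        # a 64x64 Dense layer (n_in*n_out = 4096) with reuse_factor = 4 gives block_factor = 1024 and
+        # nbanks = int(2**ceil(log2(1024)) / 2) = 512, while a 16-bit weight type gives
+        # var_width = ceil(16/8) = 2 bytes and bwidth = next_pow2(2) = 2.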
h_file.write(weight_header + var.definition_cpp() + " = {") - #fill c++ array. - #not including internal brackets for multidimensional case + # fill c++ array. + # not including internal brackets for multidimensional case sep = '' for x in var: h_file.write(sep + x) @@ -76,8 +80,8 @@ def write_project_cpp(self, model): ################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/firmware/myproject.cpp'),'r') - fout = open('{}/firmware/{}.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()),'w') + f = open(os.path.join(filedir, '../templates/quartus/firmware/myproject.cpp'), 'r') + fout = open('{}/firmware/{}.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() @@ -85,7 +89,7 @@ def write_project_cpp(self, model): indent = ' ' for line in f.readlines(): - #Add headers to weights and biases + # Add headers to weights and biases if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) @@ -94,7 +98,7 @@ def write_project_cpp(self, model): newline = line newline += 'hls_max_concurrency(0)\n' newline += 'hls_component_ii({})\n'.format(self.get_max_reuse_factor(model)) - clock_mhz = 1000/(model.config.get_config_value('ClockPeriod')) + clock_mhz = 1000 / (model.config.get_config_value('ClockPeriod')) newline += 'hls_scheduler_target_fmax_mhz({})\n'.format(np.ceil(clock_mhz).astype(np.int)) elif '//hls-fpga-machine-learning insert weights' in line: @@ -118,14 +122,14 @@ def write_project_cpp(self, model): def_cpp = var.definition_cpp() if def_cpp is not None: newline += ' ' + def_cpp + ';\n' - if layer.get_attr('activation') == 'tanh': #TODO move this to an optimizer + if layer.get_attr('activation') == 'tanh': # TODO move this to an optimizer layer.set_attr('activation') == 'dense_tanh' func = layer.get_attr('function_cpp', None) if func: newline += ' ' + func + '\n' newline += '\n' - #Just copy line + # Just copy line else: newline = line @@ -140,8 +144,8 @@ def write_project_header(self, model): ####################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/firmware/myproject.h'),'r') - fout = open('{}/firmware/{}.h'.format(model.config.get_output_dir(), model.config.get_project_name()),'w') + f = open(os.path.join(filedir, '../templates/quartus/firmware/myproject.h'), 'r') + fout = open('{}/firmware/{}.h'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() @@ -151,14 +155,14 @@ def write_project_header(self, model): for line in f.readlines(): if 'MYPROJECT' in line: - newline = line.replace('MYPROJECT',format(model.config.get_project_name().upper())) + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) elif '//hls-fpga-machine-learning insert cpragmas' in line: newline = line newline += 'hls_max_concurrency(0)\n' newline += 'hls_component_ii({})\n'.format(self.get_max_reuse_factor(model)) - clock_mhz = 1000/(model.config.get_config_value('ClockPeriod')) + clock_mhz = 1000 / (model.config.get_config_value('ClockPeriod')) newline += 'hls_scheduler_target_fmax_mhz({})\n'.format(np.ceil(clock_mhz).astype(np.int)) elif 'component output_data myproject(' in 
line: newline = 'component output_data {}(\n'.format(model.config.get_project_name()) @@ -179,12 +183,12 @@ def write_project_header(self, model): def write_defines(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/firmware/defines.h'),'r') - fout = open('{}/firmware/defines.h'.format(model.config.get_output_dir()),'w') + f = open(os.path.join(filedir, '../templates/quartus/firmware/defines.h'), 'r') + fout = open('{}/firmware/defines.h'.format(model.config.get_output_dir()), 'w') for line in f.readlines(): - #Insert numbers + # Insert numbers if '//hls-fpga-machine-learning insert numbers' in line: newline = line numbers = OrderedDict.fromkeys([layer.get_numbers_cpp() for layer in model.get_layers()]) @@ -206,14 +210,15 @@ def write_defines(self, model): def write_parameters(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/firmware/parameters.h'),'r') - fout = open('{}/firmware/parameters.h'.format(model.config.get_output_dir()),'w') + f = open(os.path.join(filedir, '../templates/quartus/firmware/parameters.h'), 'r') + fout = open('{}/firmware/parameters.h'.format(model.config.get_output_dir()), 'w') for line in f.readlines(): if '//hls-fpga-machine-learning insert includes' in line: newline = line - for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): + for include in sorted( + set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): newline += '#include "%s"\n' % include elif "//hls-fpga-machine-learning insert layer-config" in line: @@ -256,21 +261,24 @@ def write_test_bench(self, model): if input_data[-3:] == "dat": copyfile(input_data, '{}/tb_data/tb_input_features.dat'.format(model.config.get_output_dir())) else: - self.__make_dat_file(input_data,'{}/tb_data/tb_input_features.dat'.format(model.config.get_output_dir())) + self.__make_dat_file(input_data, + '{}/tb_data/tb_input_features.dat'.format(model.config.get_output_dir())) if output_predictions: if output_predictions[-3:] == "dat": - copyfile(output_predictions, '{}/tb_data/tb_output_predictions.dat'.format(model.config.get_output_dir())) + copyfile(output_predictions, + '{}/tb_data/tb_output_predictions.dat'.format(model.config.get_output_dir())) else: - self.__make_dat_file(output_predictions,'{}/tb_data/tb_output_predictions.dat'.format(model.config.get_output_dir())) + self.__make_dat_file(output_predictions, + '{}/tb_data/tb_output_predictions.dat'.format(model.config.get_output_dir())) - f = open(os.path.join(filedir,'../templates/quartus/myproject_test.cpp'),'r') - fout = open('{}/{}_test.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()),'w') + f = open(os.path.join(filedir, '../templates/quartus/myproject_test.cpp'), 'r') + fout = open('{}/{}_test.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') for line in f.readlines(): indent = ' ' * (len(line) - len(line.lstrip(' '))) - #Insert numbers + # Insert numbers if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) elif '//hls-fpga-machine-learning insert data' in line: @@ -332,8 +340,8 @@ def write_bridge(self, model): ################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/myproject_bridge.cpp'),'r') - fout = open('{}/{}_bridge.cpp'.format(model.config.get_output_dir(), 
model.config.get_project_name()),'w') + f = open(os.path.join(filedir, '../templates/quartus/myproject_bridge.cpp'), 'r') + fout = open('{}/{}_bridge.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() @@ -348,10 +356,16 @@ def write_bridge(self, model): newline = line.replace('myproject', format(model.config.get_project_name())) elif '//hls-fpga-machine-learning insert header' in line: dtype = line.split('#', 1)[1].strip() - inputs_str = ', '.join(['{type} {name}[{shape}]'.format(type=dtype, name=i.cppname, shape=i.size_cpp()) for i in model_inputs]) - outputs_str = ', '.join(['{type} {name}[{shape}]'.format(type=dtype, name=o.cppname, shape=o.size_cpp()) for o in model_outputs]) - insize_str = ', '.join(['unsigned short &const_size_in_{}'.format(i) for i in range(1, len(model_inputs) + 1)]) - outsize_str = ', '.join(['unsigned short &const_size_out_{}'.format(o) for o in range(1, len(model_outputs) + 1)]) + inputs_str = ', '.join( + ['{type} {name}[{shape}]'.format(type=dtype, name=i.cppname, shape=i.size_cpp()) for i in + model_inputs]) + outputs_str = ', '.join( + ['{type} {name}[{shape}]'.format(type=dtype, name=o.cppname, shape=o.size_cpp()) for o in + model_outputs]) + insize_str = ', '.join( + ['unsigned short &const_size_in_{}'.format(i) for i in range(1, len(model_inputs) + 1)]) + outsize_str = ', '.join( + ['unsigned short &const_size_out_{}'.format(o) for o in range(1, len(model_outputs) + 1)]) newline = '' newline += indent + inputs_str + ',\n' @@ -364,7 +378,10 @@ def write_bridge(self, model): newline = '' newline += indent + 'input_data inputs_ap;\n' for i in model_inputs: - newline += indent + 'nnet::convert_data<{}, {}, {}>({}, inputs_ap.{});\n'.format(dtype, i.type.name, i.size_cpp(), i.cppname, i.cppname) + newline += indent + 'nnet::convert_data<{}, {}, {}>({}, inputs_ap.{});\n'.format(dtype, i.type.name, + i.size_cpp(), + i.cppname, + i.cppname) newline += '\n' newline += indent + 'output_data outputs_ap;\n' @@ -373,15 +390,21 @@ def write_bridge(self, model): newline += '\n' for o in model_outputs: - newline += indent + 'nnet::convert_data_back<{}, {}, {}>(outputs_ap.{}, {});\n'.format(o.type.name, dtype, o.size_cpp(), o.cppname, o.cppname) + newline += indent + 'nnet::convert_data_back<{}, {}, {}>(outputs_ap.{}, {});\n'.format(o.type.name, + dtype, + o.size_cpp(), + o.cppname, + o.cppname) elif '//hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): func = layer.get_attr('function_cpp') - if func and model.config.trace_output and model.config.get_layer_config_value(layer, 'Trace', False): - vars = layer.get_variables() - for var in vars: - newline += indent + 'nnet::trace_outputs->insert(std::pair("{}", (void *) malloc({} * element_size)));\n'.format(layer.name, var.size_cpp()) + if func and model.config.trace_output and model.config.get_layer_config_value(layer, 'Trace', + False): + vars = layer.get_variables() + for var in vars: + newline += indent + 'nnet::trace_outputs->insert(std::pair("{}", (void *) malloc({} * element_size)));\n'.format( + layer.name, var.size_cpp()) else: newline = line @@ -396,12 +419,12 @@ def write_build_script(self, model): ################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/Makefile'),'r') - fout = open('{}/Makefile'.format(model.config.get_output_dir()),'w') + f = open(os.path.join(filedir, 
'../templates/quartus/Makefile'), 'r') + fout = open('{}/Makefile'.format(model.config.get_output_dir()), 'w') for line in f.readlines(): - line = line.replace('myproject',model.config.get_project_name()) + line = line.replace('myproject', model.config.get_project_name()) if 'DEVICE :=' in line: line = 'DEVICE := {}\n'.format(model.config.get_config_value('Part')) @@ -414,8 +437,8 @@ def write_build_script(self, model): # build_lib.sh ################### - f = open(os.path.join(filedir,'../templates/quartus/build_lib.sh'),'r') - fout = open('{}/build_lib.sh'.format(model.config.get_output_dir()),'w') + f = open(os.path.join(filedir, '../templates/quartus/build_lib.sh'), 'r') + fout = open('{}/build_lib.sh'.format(model.config.get_output_dir()), 'w') for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) @@ -432,7 +455,7 @@ def write_nnet_utils(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - srcpath = os.path.join(filedir,'../templates/quartus/firmware/nnet_utils/') + srcpath = os.path.join(filedir, '../templates/quartus/firmware/nnet_utils/') dstpath = '{}/firmware/nnet_utils/'.format(model.config.get_output_dir()) if not os.path.exists(dstpath): @@ -449,7 +472,7 @@ def write_nnet_utils(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - srcpath = os.path.join(filedir,'../templates/quartus/ac_types/') + srcpath = os.path.join(filedir, '../templates/quartus/ac_types/') dstpath = '{}/firmware/ac_types/'.format(model.config.get_output_dir()) if os.path.exists(dstpath): @@ -477,13 +500,13 @@ def __get_table_header(self, table_name, table_size): def __write_elu_table(self, model, path): table_name = 'elu_table' table_size = self.__get_table_size(model, 'elu') - - h_file = open('{}/{}.tb'.format(path, table_name),'w') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) sep = '' for i in range(table_size): - in_val = -8.0*i/float(table_size) + in_val = -8.0 * i / float(table_size) real_val = np.exp(in_val) - 1. 
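+            # The table only needs to cover in_val in (-8, 0]; the HLS-side ELU consults the table
+            # for negative inputs only, since ELU(x) = x for x >= 0.
+            # Illustrative sample: with table_size = 1024, i = 512 gives in_val = -4.0 and
+            # real_val = exp(-4) - 1 ~ -0.982.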
h_file.write(sep + str(real_val)) sep = ", " @@ -495,18 +518,19 @@ def __write_elu_table(self, model, path): def __write_sigmoid_table(self, model, path): MAX_VALUE = 8 MIN_VALUE = 0 - + table_name = 'sigmoid_table' table_size = self.__get_table_size(model, 'sigmoid') - + h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) sep = '' for i in range(table_size): - in_val = i * (MAX_VALUE-MIN_VALUE)/float(table_size) + (MAX_VALUE-MIN_VALUE)/(float(table_size)*2) + MIN_VALUE + in_val = i * (MAX_VALUE - MIN_VALUE) / float(table_size) + (MAX_VALUE - MIN_VALUE) / ( + float(table_size) * 2) + MIN_VALUE real_val = 1.0 / (1 + np.exp(-in_val)) - if(real_val >= 0.5): + if (real_val >= 0.5): h_file.write(sep + str(real_val)) sep = ", " @@ -517,18 +541,19 @@ def __write_sigmoid_table(self, model, path): def __write_tanh_table(self, model, path): MAX_VALUE = 4 MIN_VALUE = 0 - + table_name = 'tanh_table' table_size = self.__get_table_size(model, 'dense_tanh') h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) - + sep = '' for i in range(table_size): - in_val = i*(MAX_VALUE-MIN_VALUE)/float(table_size) + (MAX_VALUE-MIN_VALUE)/(float(table_size)*2) + MIN_VALUE + in_val = i * (MAX_VALUE - MIN_VALUE) / float(table_size) + (MAX_VALUE - MIN_VALUE) / ( + float(table_size) * 2) + MIN_VALUE real_val = np.tanh(in_val) - if(real_val >= 0): + if (real_val >= 0): h_file.write(sep + str(real_val)) sep = ", " @@ -545,11 +570,11 @@ def __write_softplus_table(self, model, path): sep = '' for i in range(table_size): - in_val = 2*8.0*(i-float(table_size)/2.0)/float(table_size) + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) real_val = np.log(np.exp(in_val) + 1.) h_file.write(sep + str(real_val)) sep = ", " - + h_file.write('};\n') h_file.write('\n#endif\n') h_file.close() @@ -563,7 +588,7 @@ def __write_softsign_table(self, model, path): sep = '' for i in range(table_size): - in_val = 2*8.0*(i-float(table_size)/2.0)/float(table_size) + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) real_val = in_val / (np.fabs(in_val) + 1.) 
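+            # Illustrative sample: the table spans in_val in [-8, 8); with table_size = 1024,
+            # i = 768 gives in_val = 4.0 and real_val = 4 / (4 + 1) = 0.8.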
h_file.write(sep + str(real_val)) sep = ", " @@ -578,10 +603,10 @@ def __write_selu_table(self, model, path): h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) - + sep = '' for i in range(table_size): - in_val = -8.0*i/float(table_size) + in_val = -8.0 * i / float(table_size) real_val = 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (np.exp(in_val) - 1.)) h_file.write(sep + str(real_val)) sep = ", " @@ -596,11 +621,33 @@ def __write_exp_table(self, model, path): h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) - + + # Default fixed point precision + # 6 bits for integer part, 10 bits for decimal - total, 16 + fp_bits = 16 + fp_integer = 6 + fp_signed = True + + # Exp table should use the same precision as exp_table, as seen in Vivado code + # init_exp_table(exp_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_input_variable().type + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + sep = '' + N = ceil_log2(table_size) for i in range(table_size): - in_val = 2*8.0*(i-float(table_size)/2.0)/float(table_size) - real_val = np.exp(in_val) + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.exp_float() h_file.write(sep + str(real_val)) sep = ", " @@ -614,16 +661,74 @@ def __write_invert_table(self, model, path): h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) - + + # Default fixed point precision, in case values from layer attributes cannot be extracted + # 8 bits for integer part, 10 bits for decimal - total, 18 + fp_bits = 18 + fp_integer = 8 + fp_signed = True + + # Invert table should use the same precision as exp_table, as seen in Vivado code + # init_invert_table(invert_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_attr('exp_table_t') + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.inv_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.write('\n#endif\n') + h_file.close() + + def __write_exp_table_legacy(self, model, path): + table_name = 'exp_table_legacy' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) + real_val = np.exp(in_val) + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.write('\n#endif\n') + h_file.close() + + def __write_invert_table_legacy(self, model, path): + table_name = 'invert_table_legacy' + table_size = 
self.__get_table_size(model, 'softmax') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + sep = '' for i in range(table_size): real_val = 0 - in_val = 64.0*i/float(table_size) + in_val = 64.0 * i / float(table_size) if (in_val > 0.0): - real_val = 1.0/in_val + real_val = 1.0 / in_val h_file.write(sep + str(real_val)) sep = ", " - + h_file.write('};\n') h_file.write('\n#endif\n') h_file.close() @@ -633,8 +738,9 @@ def write_activation_tables(self, model): dstpath = '{}/firmware/nnet_utils/activation_tables'.format(model.config.get_output_dir()) if not os.path.exists(dstpath): os.mkdir(dstpath) - + # Tables + # TODO - Only write tables needed by model, not all of them self.__write_elu_table(model, dstpath) self.__write_sigmoid_table(model, dstpath) self.__write_tanh_table(model, dstpath) @@ -643,7 +749,9 @@ def write_activation_tables(self, model): self.__write_selu_table(model, dstpath) self.__write_exp_table(model, dstpath) self.__write_invert_table(model, dstpath) - + self.__write_exp_table_legacy(model, dstpath) + self.__write_invert_table_legacy(model, dstpath) + def write_yml(self, model): ################### # YAML config file From c26904245d1dd129ebb7973a793cdb0ccb389e8a Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Wed, 4 May 2022 16:43:21 +0100 Subject: [PATCH 2/5] Quartus balanced reduce tree implementation. Remove circular import in nnet_helpers. --- .../quartus/firmware/nnet_utils/nnet_common.h | 42 +++++++++++++++++++ .../firmware/nnet_utils/nnet_helpers.h | 6 ++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h index cad53592c9..69b8579415 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h @@ -24,10 +24,12 @@ #include "ac_int.h" #include "ac_fixed.h" #include "math.h" +#include "nnet_helpers.h" #else #include "HLS/ac_int.h" #include "HLS/ac_fixed.h" #include "HLS/math.h" +#include "nnet_helpers.h" #endif typedef ac_fixed<16,6> table_default_t; @@ -58,6 +60,46 @@ typedef ac_fixed<32,10> accum_t_def; } } + /* --- + * Balanced tree reduce implementation. + * For use in scenarios where Quartus cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ + template + T reduce(const T* x, Op op) + { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if (N == 1){ + return x[0]; + } + if (N == 2){ + return op(x[0],x[1]); + } + return op(reduce(x, op), reduce(x+leftN, op)); + } + + + + template + class Op_add{ + public: + T operator()(T a, T b){ + return a + b; + } + }; + + template + class Op_max{ + public: + T operator()(T a, T b){ + return a >= b ? 
a : b; + } + }; + } #endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h index 40f36751cc..1027e8fb00 100755 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h @@ -26,7 +26,6 @@ #include #include #include -#include "nnet_common.h" namespace nnet { @@ -50,6 +49,11 @@ extern size_t trace_type_size; constexpr int ceillog2(int x){ return (x <= 2) ? 1 : 1 + ceillog2((x+1) / 2); } + +constexpr int floorlog2(int x){ + return (x < 2) ? 0 : 1 + floorlog2(x / 2); +} + constexpr int pow2(int x){ return x == 0 ? 1 : 2 * pow2(x - 1); } From 8496fc531e587034468bc1ca8da3814db51aa304 Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Thu, 5 May 2022 11:47:56 +0100 Subject: [PATCH 3/5] Quartus stable Softmax --- .../backends/quartus/passes/core_templates.py | 1 + hls4ml/backends/quartus/quartus_backend.py | 5 + .../firmware/nnet_utils/nnet_activation.h | 140 +++++++++++++----- 3 files changed, 108 insertions(+), 38 deletions(-) diff --git a/hls4ml/backends/quartus/passes/core_templates.py b/hls4ml/backends/quartus/passes/core_templates.py index 88fed63994..c18447f88c 100644 --- a/hls4ml/backends/quartus/passes/core_templates.py +++ b/hls4ml/backends/quartus/passes/core_templates.py @@ -122,6 +122,7 @@ def format(self, node): static const unsigned table_size = {table_size}; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; + static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; typedef {exp_table_t.name} exp_table_t; typedef {inv_table_t.name} inv_table_t; }};\n""" diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index e3e289aaeb..4ee6781f82 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -188,3 +188,8 @@ def init_softmax(self, layer): layer.set_attr('exp_table_t', layer.get_attr('table_t')) if 'inv_table_t' not in layer.attributes: layer.set_attr('inv_table_t', layer.get_attr('table_t')) + if layer.model.config.is_resource_strategy(layer): + # 'resource' strategy = 'latency' for Softmax + layer.set_attr('implementation', 'latency') + else: + layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower()) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h index 1eb9524fe4..5b4452199d 100755 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h @@ -126,47 +126,111 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // ************************************************* // Softmax Activation // ************************************************* + +enum class softmax_implementation {latency=0, legacy=1, stable=2}; + +template +inline unsigned softmax_idx_from_real_val(const data_T x){ + // Number of address bits for table + static constexpr int N = ceillog2(CONFIG_T::table_size); + + // Slice the top N bits of the input + hls_register ac_int y = x.template slc(x.width-N); + return y.to_uint(); +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ + // Look-up tables + #include "activation_tables/exp_table.tb" + #include "activation_tables/invert_table.tb" + + // Find maximum + Op_max op_max; 
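+    // Sketch of the intended instantiation, assuming the template<class T, int N, class Op>
+    // signature of reduce declared in nnet_common.h:
+    //     data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);
+    // i.e. the binary max operator is folded over all CONFIG_T::n_in inputs by the balanced tree.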
+ hls_register data_T x_max = reduce>(data, op_max); + + // Calculate differences from the maximum, forcing rounding and saturation for better accuracy + hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table[softmax_idx_from_real_val(d_xi_xmax[i])]; + } + + // Explicitly sum previously calculated exponentials with an adder tree + Op_add op_add; + hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + template -void softmax( data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - #include "activation_tables/exp_table.tb" - #include "activation_tables/invert_table.tb" + #include "activation_tables/exp_table_legacy.tb" + #include "activation_tables/invert_table_legacy.tb" - hls_register int data_round[CONFIG_T::n_in]; - New_loop: - #pragma unroll - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - - typename CONFIG_T::exp_table_t temp_exp = exp_table[index]; - exp_res_temp += temp_exp; - } - } - int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int(); - if (exp_res_index < 0) exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; - res[ii] = invert_table[exp_res_index]; - } + hls_register int data_round[CONFIG_T::n_in]; + New_loop: + #pragma unroll + for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + + typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index]; + exp_res_temp += temp_exp; + } + } + int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int(); + if (exp_res_index < 0) exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; + res[ii] = invert_table_legacy[exp_res_index]; + } +} + +template +inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ + switch(CONFIG_T::implementation) { + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + default: + softmax_stable(data, res); + break; + } } // ************************************************* From 7351c76f18231a91f7cd706a269c482fe3ffd460 Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Thu, 5 May 2022 12:08:09 +0100 Subject: [PATCH 4/5] Quartus latency Softmax --- .../firmware/nnet_utils/nnet_activation.h | 49 +++++++++-- hls4ml/writer/quartus_writer.py | 82 +++++++++++++++++++ 2 files changed, 123 insertions(+), 8 deletions(-) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h index 5b4452199d..ce1c2b1f6b 100755 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h @@ -175,9 +175,42 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ } } +// 
TODO - Improve accuracy +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ + /* + * Note: The latency tables are equivalent to stable tables + * However, the compiler cannot include the same table twice + * Therefore, an out-of-scope exception is thrown in one of the functions + * Temporary solution - Create the same table twice in quartus_writer.py + * Long-term solution - Only create tables needed by the network; + * Currently, quartus-writer.py generates LUTs for all activations, + * Regardless if they are present in the network or not + */ + #include "activation_tables/exp_table_latency.tb" + #include "activation_tables/invert_table_latency.tb" + + // Calculate all the e^x's + hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table_latency[softmax_idx_from_real_val(data[i])]; + } + + // Explicitly sum the results with an adder tree. + Op_add op_add; + hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val(exp_sum)]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + res[i] = exp_res[i] * inv_exp_sum; + } +} + template -void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" @@ -193,14 +226,11 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) typename CONFIG_T::exp_table_t exp_res_temp = 0; NN_Inner: #pragma unroll - for (int jj=0; jj(data, res); break; + case softmax_implementation::latency: + softmax_latency(data, res); + break; case softmax_implementation::legacy: softmax_legacy(data, res); break; diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py index 99559a6177..f8c35eea05 100644 --- a/hls4ml/writer/quartus_writer.py +++ b/hls4ml/writer/quartus_writer.py @@ -695,6 +695,86 @@ def __write_invert_table(self, model, path): h_file.write('\n#endif\n') h_file.close() + def __write_exp_table_latency(self, model, path): + table_name = 'exp_table_latency' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision + # 6 bits for integer part, 10 bits for decimal - total, 16 + fp_bits = 16 + fp_integer = 6 + fp_signed = True + + # Exp table should use the same precision as exp_table, as seen in Vivado code + # init_exp_table(exp_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_input_variable().type + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.exp_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + 
h_file.write('\n#endif\n') + h_file.close() + + def __write_invert_table_latency(self, model, path): + table_name = 'invert_table_latency' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision, in case values from layer attributes cannot be extracted + # 8 bits for integer part, 10 bits for decimal - total, 18 + fp_bits = 18 + fp_integer = 8 + fp_signed = True + + # Invert table should use the same precision as exp_table, as seen in Vivado code + # init_invert_table(invert_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_attr('exp_table_t') + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.inv_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.write('\n#endif\n') + h_file.close() + def __write_exp_table_legacy(self, model, path): table_name = 'exp_table_legacy' table_size = self.__get_table_size(model, 'softmax') @@ -749,6 +829,8 @@ def write_activation_tables(self, model): self.__write_selu_table(model, dstpath) self.__write_exp_table(model, dstpath) self.__write_invert_table(model, dstpath) + self.__write_exp_table_latency(model, dstpath) + self.__write_invert_table_latency(model, dstpath) self.__write_exp_table_legacy(model, dstpath) self.__write_invert_table_legacy(model, dstpath) From a7753fbe3d7de2ac82f17ae51f953b3cba133ebe Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Thu, 5 May 2022 12:09:19 +0100 Subject: [PATCH 5/5] Quartus tests for Softmax --- test/pytest/test_softmax.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 44bfb9dc63..08625d9314 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -24,20 +24,27 @@ def generate_data(function, input_shape): return function((1000, *input_shape)) -# TODO: include latency strategy with flat_distribution when it can be made to pass +# TODO: Include latency strategy with flat_distribution when it can be made to pass @pytest.mark.parametrize('backend,strategy,function,input_shape,io_type', [ - #('latency', flat_distribution, (8,), 'io_parallel'), - #('latency', flat_distribution, (8, 8, 3), 'io_stream'), ('Vivado', 'stable', flat_distribution, (8,), 'io_parallel'), ('Vivado', 'stable', high_accuracy_distribution, (8,), 'io_parallel'), - ('Quartus', 'resource', flat_distribution, (8,), 'io_parallel'), - ('Quartus', 'resource', high_accuracy_distribution, (8,), 'io_parallel'), + + ('Quartus', 'stable', flat_distribution, (8,), 'io_parallel'), + ('Quartus', 'stable', high_accuracy_distribution, (8,), 'io_parallel'), + + # Streaming, single-dimensional implementation (not supported on Quartus yet) ('Vivado', 'stable', flat_distribution, (8,), 'io_stream'), ('Vivado', 'stable', high_accuracy_distribution, (8,), 'io_stream'), + # Multi-dimensional tests, only for io_stream for now ('Vivado', 'stable', flat_distribution, (8, 8, 3), 'io_stream'), - 
('Vivado', 'stable', high_accuracy_distribution, (8, 8, 3), 'io_stream') - + ('Vivado', 'stable', high_accuracy_distribution, (8, 8, 3), 'io_stream'), + + # Latency, include when test pass + #('Vivado', 'latency', flat_distribution, (8,), 'io_parallel'), + #('Vivado', 'latency', flat_distribution, (8, 8, 3), 'io_stream'), + #('Quartus', 'latency', flat_distribution, (8,), 'io_parallel'), + ]) def test_softmax(backend, strategy, generate_data, input_shape, io_type): X = generate_data