From 773956ec122a085b5e93456778542b160f51e1a4 Mon Sep 17 00:00:00 2001
From: Benjamin Ramhorst
Date: Wed, 4 May 2022 16:11:56 +0100
Subject: [PATCH 1/5] Improved Quartus Softmax LUT - Vivado-equivalent approach

---
 hls4ml/utils/fixed_point_utils.py | 126 +++++++++++++++
 hls4ml/writer/quartus_writer.py   | 260 +++++++++++++++++++++---
 2 files changed, 310 insertions(+), 76 deletions(-)
 create mode 100644 hls4ml/utils/fixed_point_utils.py

diff --git a/hls4ml/utils/fixed_point_utils.py b/hls4ml/utils/fixed_point_utils.py
new file mode 100644
index 0000000000..0060cc5360
--- /dev/null
+++ b/hls4ml/utils/fixed_point_utils.py
@@ -0,0 +1,126 @@
+import sys
+import math
+
+'''
+A helper class for emulating fixed point numbers.
+Currently very limited, allowing only:
+    - Conversion to float
+    - Exponents
+    - Reciprocals
+Used primarily for generating the softmax look-up tables
+through bit manipulation (see the equivalent Vivado implementation)
+'''
+class FixedPointEmulator:
+    '''
+        Default constructor
+        Args:
+            - N : Total number of bits in the fixed point number
+            - I : Integer bits in the fixed point number
+            - F = N-I : Fractional bits in the fixed point number
+            - signed : True/False - If True, use 2's complement when converting to float
+            - integer_bits : Optional initial values of the bits in the integer part
+            - decimal_bits : Optional initial values of the bits in the fractional part
+    '''
+    def __init__(self, N, I, signed=True, integer_bits=None, decimal_bits=None):
+        self.N = N
+        self.I = I
+        self.F = N - I
+        self.signed = signed
+        self.integer_bits = [0] * self.I if integer_bits is None else integer_bits
+        self.decimal_bits = [0] * self.F if decimal_bits is None else decimal_bits
+
+    '''
+        Converts the fixed point number stored in self.integer_bits and self.decimal_bits to a floating point number
+        Args:
+            - None
+        Returns:
+            - val : float, the floating point equivalent of the fixed point number
+        Description:
+            1. Initialise the result from the MSB: -2.0^(I-1) if the number is signed and the MSB is set,
+               +2.0^(I-1) if unsigned and the MSB is set, 0.0 otherwise
+            2. Traverse the remaining integer bits, incrementing the result by 2.0^(i-1) (using left shifts)
+            3. Traverse the decimal bits, incrementing the result by 2.0^(-(i+1)) (using pow)
+        Note:
+            - This function uses left shifts instead of integer powers of 2.
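+            - Worked example (illustrative): with N=6, I=3, signed=True, integer_bits=[1, 0, 1] and
+              decimal_bits=[1, 0, 0] (the two's complement pattern 101.100), to_float() returns
+              -4 + 1 + 0.5 = -2.5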
+    '''
+    def to_float(self):
+        val = float(int(self.integer_bits[0]) << (self.I - 1))
+        val = -val if self.signed else val
+
+        for i in range(self.I - 1, 0, -1):
+            val += float(int(self.integer_bits[self.I - i]) << (i - 1))
+
+        for i in range(0, self.F):
+            if (self.decimal_bits[i]):
+                val += pow(2, -(i + 1))
+
+        return val
+
+    '''
+        Sets the top bits of the current number
+        Args:
+            - bits : Values the top bits should be set to
+    '''
+    def set_msb_bits(self, bits):
+        for i in range(0, len(bits)):
+            if i < self.I:
+                self.integer_bits[i] = bits[i]
+            elif i >= self.I and i < self.N:
+                self.decimal_bits[i - self.I] = bits[i]
+
+    '''
+        Returns e^x, where x is the current fixed point number, as a float
+    '''
+    def exp_float(self, sig_figs=12):
+        return round(math.exp(self.to_float()), sig_figs)
+
+    '''
+        Returns 1/x, where x is the current fixed point number, as a float
+        If x is zero, the largest representable float is returned instead
+    '''
+    def inv_float(self, sig_figs=12):
+        if self.to_float() != 0:
+            return round(1.0 / self.to_float(), sig_figs)
+        else:
+            return round(sys.float_info.max, sig_figs)
+
+
+'''
+    Converts an unsigned integer i to its N-bit binary representation, as a list of 0/1 values
+    Args:
+        - i : Unsigned integer to be converted
+        - N : Number of bits in the representation
+    Note:
+        - Requires N > log2(i)+1
+'''
+def uint_to_binary(i, N):
+    # Gets the binary representation of the number
+    bits = [int(b) for b in list('{0:0b}'.format(i))]
+
+    # Zero padding, so exactly N bits are used
+    while (len(bits) < N):
+        bits.insert(0, 0)
+
+    return bits
+
+
+'''
+    Returns log2(i), rounding up
+    Args:
+        - i : Number
+    Returns:
+        - val : representing ceil(log2(i))
+'''
+def ceil_log2(i):
+    return i.bit_length()-1
\ No newline at end of file
diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py
index 9a49422885..99559a6177 100644
--- a/hls4ml/writer/quartus_writer.py
+++ b/hls4ml/writer/quartus_writer.py
@@ -3,24 +3,27 @@
 import yaml
 from shutil import copyfile, copytree, rmtree
 import numpy as np
+import re
 import os
 import glob
 from collections import OrderedDict
 from hls4ml.writer.writers import Writer
+from hls4ml.utils.fixed_point_utils import FixedPointEmulator, ceil_log2, uint_to_binary

 config_filename = 'hls4ml_config.yml'

+
 class QuartusWriter(Writer):
     def next_pow2(self, x):
-        return 1<<(x-1).bit_length()
+        return 1 << (x - 1).bit_length()

     def get_max_reuse_factor(self, model):
         max_rf = 0
         for layer in model.get_layers():
             rf = int(layer.get_attr('reuse_factor'))
-            if(rf > max_rf):
+            if (rf > max_rf):
                 max_rf = rf
         return max_rf

@@ -28,9 +31,9 @@ def print_array_to_cpp(self, var, layer, odir):
         #######################################
         ## Print weight array to C++
         #######################################
-        h_file = open("{}/firmware/weights/{}.h".format(odir,var.name),"w")
+        h_file = open("{}/firmware/weights/{}.h".format(odir, var.name), "w")

-        #meta data
+        # meta data
         h_file.write("//Numpy array shape {}\n".format(var.shape))
         h_file.write("//Min {:.12f}\n".format(np.min(var.min)))
         h_file.write("//Max {:.12f}\n".format(np.max(var.max)))
         h_file.write("//Number of zeros {}\n".format(var.nzeros))
         h_file.write("\n")
         h_file.write("#ifndef {}_H_\n".format(var.name.upper()))
         h_file.write("#define {}_H_\n".format(var.name.upper()))
         h_file.write("\n")

         rf = int(layer.get_attr('reuse_factor'))
         weight_header = '#ifdef __INTELFPGA_COMPILER__\n'
-        if (rf == 1 or var.name[0] == 'b' or layer.get_attr('n_in')*layer.get_attr('n_out') <= 2048
+        if (rf == 1 or var.name[0] == 'b' or layer.get_attr('n_in') * layer.get_attr('n_out') <= 2048
                 or (var.name[0] == 'w' and var.type.precision.width < 3)):
             weight_header += 'hls_init_on_powerup\n'
         else:
-            block_factor = (layer.get_attr('n_in')*layer.get_attr('n_out'))/rf
-            nbanks = int(2**np.ceil(np.log2(block_factor)) / 2)
+            block_factor = (layer.get_attr('n_in') * layer.get_attr('n_out')) / rf
+            nbanks = int(2 ** np.ceil(np.log2(block_factor)) / 2)
             var_width = int(np.ceil(var.type.precision.width / 8))
             bwidth = self.next_pow2(var_width)
-            weight_header += 'hls_bankwidth({bwidth})\nhls_numbanks({nbanks})\nhls_max_replicates(1)\nhls_memory_impl("BLOCK_RAM")\n'.format(bwidth=bwidth, nbanks=nbanks)
+            weight_header += 'hls_bankwidth({bwidth})\nhls_numbanks({nbanks})\nhls_max_replicates(1)\nhls_memory_impl("BLOCK_RAM")\n'.format(
+                bwidth=bwidth, nbanks=nbanks)
         weight_header += '#endif\n'
         weight_header += 'static const '
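+        # Illustrative example of the banking arithmetic above (hypothetical layer, not from the model):
+        # a 64x64 Dense layer (n_in*n_out = 4096) with reuse_factor = 4 gives block_factor = 1024 and
+        # nbanks = int(2**ceil(log2(1024)) / 2) = 512, while a 16-bit weight type gives
+        # var_width = ceil(16/8) = 2 bytes and bwidth = next_pow2(2) = 2.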
h_file.write(weight_header + var.definition_cpp() + " = {") - #fill c++ array. - #not including internal brackets for multidimensional case + # fill c++ array. + # not including internal brackets for multidimensional case sep = '' for x in var: h_file.write(sep + x) @@ -76,8 +80,8 @@ def write_project_cpp(self, model): ################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/firmware/myproject.cpp'),'r') - fout = open('{}/firmware/{}.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()),'w') + f = open(os.path.join(filedir, '../templates/quartus/firmware/myproject.cpp'), 'r') + fout = open('{}/firmware/{}.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() @@ -85,7 +89,7 @@ def write_project_cpp(self, model): indent = ' ' for line in f.readlines(): - #Add headers to weights and biases + # Add headers to weights and biases if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) @@ -94,7 +98,7 @@ def write_project_cpp(self, model): newline = line newline += 'hls_max_concurrency(0)\n' newline += 'hls_component_ii({})\n'.format(self.get_max_reuse_factor(model)) - clock_mhz = 1000/(model.config.get_config_value('ClockPeriod')) + clock_mhz = 1000 / (model.config.get_config_value('ClockPeriod')) newline += 'hls_scheduler_target_fmax_mhz({})\n'.format(np.ceil(clock_mhz).astype(np.int)) elif '//hls-fpga-machine-learning insert weights' in line: @@ -118,14 +122,14 @@ def write_project_cpp(self, model): def_cpp = var.definition_cpp() if def_cpp is not None: newline += ' ' + def_cpp + ';\n' - if layer.get_attr('activation') == 'tanh': #TODO move this to an optimizer + if layer.get_attr('activation') == 'tanh': # TODO move this to an optimizer layer.set_attr('activation') == 'dense_tanh' func = layer.get_attr('function_cpp', None) if func: newline += ' ' + func + '\n' newline += '\n' - #Just copy line + # Just copy line else: newline = line @@ -140,8 +144,8 @@ def write_project_header(self, model): ####################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/firmware/myproject.h'),'r') - fout = open('{}/firmware/{}.h'.format(model.config.get_output_dir(), model.config.get_project_name()),'w') + f = open(os.path.join(filedir, '../templates/quartus/firmware/myproject.h'), 'r') + fout = open('{}/firmware/{}.h'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() @@ -151,14 +155,14 @@ def write_project_header(self, model): for line in f.readlines(): if 'MYPROJECT' in line: - newline = line.replace('MYPROJECT',format(model.config.get_project_name().upper())) + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) elif 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) elif '//hls-fpga-machine-learning insert cpragmas' in line: newline = line newline += 'hls_max_concurrency(0)\n' newline += 'hls_component_ii({})\n'.format(self.get_max_reuse_factor(model)) - clock_mhz = 1000/(model.config.get_config_value('ClockPeriod')) + clock_mhz = 1000 / (model.config.get_config_value('ClockPeriod')) newline += 'hls_scheduler_target_fmax_mhz({})\n'.format(np.ceil(clock_mhz).astype(np.int)) elif 'component output_data myproject(' in 
line: newline = 'component output_data {}(\n'.format(model.config.get_project_name()) @@ -179,12 +183,12 @@ def write_project_header(self, model): def write_defines(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/firmware/defines.h'),'r') - fout = open('{}/firmware/defines.h'.format(model.config.get_output_dir()),'w') + f = open(os.path.join(filedir, '../templates/quartus/firmware/defines.h'), 'r') + fout = open('{}/firmware/defines.h'.format(model.config.get_output_dir()), 'w') for line in f.readlines(): - #Insert numbers + # Insert numbers if '//hls-fpga-machine-learning insert numbers' in line: newline = line numbers = OrderedDict.fromkeys([layer.get_numbers_cpp() for layer in model.get_layers()]) @@ -206,14 +210,15 @@ def write_defines(self, model): def write_parameters(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/firmware/parameters.h'),'r') - fout = open('{}/firmware/parameters.h'.format(model.config.get_output_dir()),'w') + f = open(os.path.join(filedir, '../templates/quartus/firmware/parameters.h'), 'r') + fout = open('{}/firmware/parameters.h'.format(model.config.get_output_dir()), 'w') for line in f.readlines(): if '//hls-fpga-machine-learning insert includes' in line: newline = line - for include in sorted(set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): + for include in sorted( + set(sum((layer.get_attr('include_header', []) for layer in model.get_layers()), []))): newline += '#include "%s"\n' % include elif "//hls-fpga-machine-learning insert layer-config" in line: @@ -256,21 +261,24 @@ def write_test_bench(self, model): if input_data[-3:] == "dat": copyfile(input_data, '{}/tb_data/tb_input_features.dat'.format(model.config.get_output_dir())) else: - self.__make_dat_file(input_data,'{}/tb_data/tb_input_features.dat'.format(model.config.get_output_dir())) + self.__make_dat_file(input_data, + '{}/tb_data/tb_input_features.dat'.format(model.config.get_output_dir())) if output_predictions: if output_predictions[-3:] == "dat": - copyfile(output_predictions, '{}/tb_data/tb_output_predictions.dat'.format(model.config.get_output_dir())) + copyfile(output_predictions, + '{}/tb_data/tb_output_predictions.dat'.format(model.config.get_output_dir())) else: - self.__make_dat_file(output_predictions,'{}/tb_data/tb_output_predictions.dat'.format(model.config.get_output_dir())) + self.__make_dat_file(output_predictions, + '{}/tb_data/tb_output_predictions.dat'.format(model.config.get_output_dir())) - f = open(os.path.join(filedir,'../templates/quartus/myproject_test.cpp'),'r') - fout = open('{}/{}_test.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()),'w') + f = open(os.path.join(filedir, '../templates/quartus/myproject_test.cpp'), 'r') + fout = open('{}/{}_test.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') for line in f.readlines(): indent = ' ' * (len(line) - len(line.lstrip(' '))) - #Insert numbers + # Insert numbers if 'myproject' in line: newline = line.replace('myproject', model.config.get_project_name()) elif '//hls-fpga-machine-learning insert data' in line: @@ -332,8 +340,8 @@ def write_bridge(self, model): ################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/myproject_bridge.cpp'),'r') - fout = open('{}/{}_bridge.cpp'.format(model.config.get_output_dir(), 
model.config.get_project_name()),'w') + f = open(os.path.join(filedir, '../templates/quartus/myproject_bridge.cpp'), 'r') + fout = open('{}/{}_bridge.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()), 'w') model_inputs = model.get_input_variables() model_outputs = model.get_output_variables() @@ -348,10 +356,16 @@ def write_bridge(self, model): newline = line.replace('myproject', format(model.config.get_project_name())) elif '//hls-fpga-machine-learning insert header' in line: dtype = line.split('#', 1)[1].strip() - inputs_str = ', '.join(['{type} {name}[{shape}]'.format(type=dtype, name=i.cppname, shape=i.size_cpp()) for i in model_inputs]) - outputs_str = ', '.join(['{type} {name}[{shape}]'.format(type=dtype, name=o.cppname, shape=o.size_cpp()) for o in model_outputs]) - insize_str = ', '.join(['unsigned short &const_size_in_{}'.format(i) for i in range(1, len(model_inputs) + 1)]) - outsize_str = ', '.join(['unsigned short &const_size_out_{}'.format(o) for o in range(1, len(model_outputs) + 1)]) + inputs_str = ', '.join( + ['{type} {name}[{shape}]'.format(type=dtype, name=i.cppname, shape=i.size_cpp()) for i in + model_inputs]) + outputs_str = ', '.join( + ['{type} {name}[{shape}]'.format(type=dtype, name=o.cppname, shape=o.size_cpp()) for o in + model_outputs]) + insize_str = ', '.join( + ['unsigned short &const_size_in_{}'.format(i) for i in range(1, len(model_inputs) + 1)]) + outsize_str = ', '.join( + ['unsigned short &const_size_out_{}'.format(o) for o in range(1, len(model_outputs) + 1)]) newline = '' newline += indent + inputs_str + ',\n' @@ -364,7 +378,10 @@ def write_bridge(self, model): newline = '' newline += indent + 'input_data inputs_ap;\n' for i in model_inputs: - newline += indent + 'nnet::convert_data<{}, {}, {}>({}, inputs_ap.{});\n'.format(dtype, i.type.name, i.size_cpp(), i.cppname, i.cppname) + newline += indent + 'nnet::convert_data<{}, {}, {}>({}, inputs_ap.{});\n'.format(dtype, i.type.name, + i.size_cpp(), + i.cppname, + i.cppname) newline += '\n' newline += indent + 'output_data outputs_ap;\n' @@ -373,15 +390,21 @@ def write_bridge(self, model): newline += '\n' for o in model_outputs: - newline += indent + 'nnet::convert_data_back<{}, {}, {}>(outputs_ap.{}, {});\n'.format(o.type.name, dtype, o.size_cpp(), o.cppname, o.cppname) + newline += indent + 'nnet::convert_data_back<{}, {}, {}>(outputs_ap.{}, {});\n'.format(o.type.name, + dtype, + o.size_cpp(), + o.cppname, + o.cppname) elif '//hls-fpga-machine-learning insert trace_outputs' in line: newline = '' for layer in model.get_layers(): func = layer.get_attr('function_cpp') - if func and model.config.trace_output and model.config.get_layer_config_value(layer, 'Trace', False): - vars = layer.get_variables() - for var in vars: - newline += indent + 'nnet::trace_outputs->insert(std::pair("{}", (void *) malloc({} * element_size)));\n'.format(layer.name, var.size_cpp()) + if func and model.config.trace_output and model.config.get_layer_config_value(layer, 'Trace', + False): + vars = layer.get_variables() + for var in vars: + newline += indent + 'nnet::trace_outputs->insert(std::pair("{}", (void *) malloc({} * element_size)));\n'.format( + layer.name, var.size_cpp()) else: newline = line @@ -396,12 +419,12 @@ def write_build_script(self, model): ################### filedir = os.path.dirname(os.path.abspath(__file__)) - f = open(os.path.join(filedir,'../templates/quartus/Makefile'),'r') - fout = open('{}/Makefile'.format(model.config.get_output_dir()),'w') + f = open(os.path.join(filedir, 
'../templates/quartus/Makefile'), 'r') + fout = open('{}/Makefile'.format(model.config.get_output_dir()), 'w') for line in f.readlines(): - line = line.replace('myproject',model.config.get_project_name()) + line = line.replace('myproject', model.config.get_project_name()) if 'DEVICE :=' in line: line = 'DEVICE := {}\n'.format(model.config.get_config_value('Part')) @@ -414,8 +437,8 @@ def write_build_script(self, model): # build_lib.sh ################### - f = open(os.path.join(filedir,'../templates/quartus/build_lib.sh'),'r') - fout = open('{}/build_lib.sh'.format(model.config.get_output_dir()),'w') + f = open(os.path.join(filedir, '../templates/quartus/build_lib.sh'), 'r') + fout = open('{}/build_lib.sh'.format(model.config.get_output_dir()), 'w') for line in f.readlines(): line = line.replace('myproject', model.config.get_project_name()) @@ -432,7 +455,7 @@ def write_nnet_utils(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - srcpath = os.path.join(filedir,'../templates/quartus/firmware/nnet_utils/') + srcpath = os.path.join(filedir, '../templates/quartus/firmware/nnet_utils/') dstpath = '{}/firmware/nnet_utils/'.format(model.config.get_output_dir()) if not os.path.exists(dstpath): @@ -449,7 +472,7 @@ def write_nnet_utils(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - srcpath = os.path.join(filedir,'../templates/quartus/ac_types/') + srcpath = os.path.join(filedir, '../templates/quartus/ac_types/') dstpath = '{}/firmware/ac_types/'.format(model.config.get_output_dir()) if os.path.exists(dstpath): @@ -477,13 +500,13 @@ def __get_table_header(self, table_name, table_size): def __write_elu_table(self, model, path): table_name = 'elu_table' table_size = self.__get_table_size(model, 'elu') - - h_file = open('{}/{}.tb'.format(path, table_name),'w') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) sep = '' for i in range(table_size): - in_val = -8.0*i/float(table_size) + in_val = -8.0 * i / float(table_size) real_val = np.exp(in_val) - 1. 
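+            # The table only needs to cover in_val in (-8, 0]; the HLS-side ELU consults the table
+            # for negative inputs only, since ELU(x) = x for x >= 0.
+            # Illustrative sample: with table_size = 1024, i = 512 gives in_val = -4.0 and
+            # real_val = exp(-4) - 1 ~ -0.982.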
h_file.write(sep + str(real_val)) sep = ", " @@ -495,18 +518,19 @@ def __write_elu_table(self, model, path): def __write_sigmoid_table(self, model, path): MAX_VALUE = 8 MIN_VALUE = 0 - + table_name = 'sigmoid_table' table_size = self.__get_table_size(model, 'sigmoid') - + h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) sep = '' for i in range(table_size): - in_val = i * (MAX_VALUE-MIN_VALUE)/float(table_size) + (MAX_VALUE-MIN_VALUE)/(float(table_size)*2) + MIN_VALUE + in_val = i * (MAX_VALUE - MIN_VALUE) / float(table_size) + (MAX_VALUE - MIN_VALUE) / ( + float(table_size) * 2) + MIN_VALUE real_val = 1.0 / (1 + np.exp(-in_val)) - if(real_val >= 0.5): + if (real_val >= 0.5): h_file.write(sep + str(real_val)) sep = ", " @@ -517,18 +541,19 @@ def __write_sigmoid_table(self, model, path): def __write_tanh_table(self, model, path): MAX_VALUE = 4 MIN_VALUE = 0 - + table_name = 'tanh_table' table_size = self.__get_table_size(model, 'dense_tanh') h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) - + sep = '' for i in range(table_size): - in_val = i*(MAX_VALUE-MIN_VALUE)/float(table_size) + (MAX_VALUE-MIN_VALUE)/(float(table_size)*2) + MIN_VALUE + in_val = i * (MAX_VALUE - MIN_VALUE) / float(table_size) + (MAX_VALUE - MIN_VALUE) / ( + float(table_size) * 2) + MIN_VALUE real_val = np.tanh(in_val) - if(real_val >= 0): + if (real_val >= 0): h_file.write(sep + str(real_val)) sep = ", " @@ -545,11 +570,11 @@ def __write_softplus_table(self, model, path): sep = '' for i in range(table_size): - in_val = 2*8.0*(i-float(table_size)/2.0)/float(table_size) + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) real_val = np.log(np.exp(in_val) + 1.) h_file.write(sep + str(real_val)) sep = ", " - + h_file.write('};\n') h_file.write('\n#endif\n') h_file.close() @@ -563,7 +588,7 @@ def __write_softsign_table(self, model, path): sep = '' for i in range(table_size): - in_val = 2*8.0*(i-float(table_size)/2.0)/float(table_size) + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) real_val = in_val / (np.fabs(in_val) + 1.) 
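+            # Illustrative sample: the table spans in_val in [-8, 8); with table_size = 1024,
+            # i = 768 gives in_val = 4.0 and real_val = 4 / (4 + 1) = 0.8.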
h_file.write(sep + str(real_val)) sep = ", " @@ -578,10 +603,10 @@ def __write_selu_table(self, model, path): h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) - + sep = '' for i in range(table_size): - in_val = -8.0*i/float(table_size) + in_val = -8.0 * i / float(table_size) real_val = 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (np.exp(in_val) - 1.)) h_file.write(sep + str(real_val)) sep = ", " @@ -596,11 +621,33 @@ def __write_exp_table(self, model, path): h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) - + + # Default fixed point precision + # 6 bits for integer part, 10 bits for decimal - total, 16 + fp_bits = 16 + fp_integer = 6 + fp_signed = True + + # Exp table should use the same precision as exp_table, as seen in Vivado code + # init_exp_table(exp_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_input_variable().type + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + sep = '' + N = ceil_log2(table_size) for i in range(table_size): - in_val = 2*8.0*(i-float(table_size)/2.0)/float(table_size) - real_val = np.exp(in_val) + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.exp_float() h_file.write(sep + str(real_val)) sep = ", " @@ -614,16 +661,74 @@ def __write_invert_table(self, model, path): h_file = open('{}/{}.tb'.format(path, table_name), 'w') h_file.write(self.__get_table_header(table_name, table_size)) - + + # Default fixed point precision, in case values from layer attributes cannot be extracted + # 8 bits for integer part, 10 bits for decimal - total, 18 + fp_bits = 18 + fp_integer = 8 + fp_signed = True + + # Invert table should use the same precision as exp_table, as seen in Vivado code + # init_invert_table(invert_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_attr('exp_table_t') + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.inv_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.write('\n#endif\n') + h_file.close() + + def __write_exp_table_legacy(self, model, path): + table_name = 'exp_table_legacy' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + sep = '' + for i in range(table_size): + in_val = 2 * 8.0 * (i - float(table_size) / 2.0) / float(table_size) + real_val = np.exp(in_val) + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.write('\n#endif\n') + h_file.close() + + def __write_invert_table_legacy(self, model, path): + table_name = 'invert_table_legacy' + table_size = 
self.__get_table_size(model, 'softmax') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + sep = '' for i in range(table_size): real_val = 0 - in_val = 64.0*i/float(table_size) + in_val = 64.0 * i / float(table_size) if (in_val > 0.0): - real_val = 1.0/in_val + real_val = 1.0 / in_val h_file.write(sep + str(real_val)) sep = ", " - + h_file.write('};\n') h_file.write('\n#endif\n') h_file.close() @@ -633,8 +738,9 @@ def write_activation_tables(self, model): dstpath = '{}/firmware/nnet_utils/activation_tables'.format(model.config.get_output_dir()) if not os.path.exists(dstpath): os.mkdir(dstpath) - + # Tables + # TODO - Only write tables needed by model, not all of them self.__write_elu_table(model, dstpath) self.__write_sigmoid_table(model, dstpath) self.__write_tanh_table(model, dstpath) @@ -643,7 +749,9 @@ def write_activation_tables(self, model): self.__write_selu_table(model, dstpath) self.__write_exp_table(model, dstpath) self.__write_invert_table(model, dstpath) - + self.__write_exp_table_legacy(model, dstpath) + self.__write_invert_table_legacy(model, dstpath) + def write_yml(self, model): ################### # YAML config file From c26904245d1dd129ebb7973a793cdb0ccb389e8a Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Wed, 4 May 2022 16:43:21 +0100 Subject: [PATCH 2/5] Quartus balanced reduce tree implementation. Remove circular import in nnet_helpers. --- .../quartus/firmware/nnet_utils/nnet_common.h | 42 +++++++++++++++++++ .../firmware/nnet_utils/nnet_helpers.h | 6 ++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h index cad53592c9..69b8579415 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h @@ -24,10 +24,12 @@ #include "ac_int.h" #include "ac_fixed.h" #include "math.h" +#include "nnet_helpers.h" #else #include "HLS/ac_int.h" #include "HLS/ac_fixed.h" #include "HLS/math.h" +#include "nnet_helpers.h" #endif typedef ac_fixed<16,6> table_default_t; @@ -58,6 +60,46 @@ typedef ac_fixed<32,10> accum_t_def; } } + /* --- + * Balanced tree reduce implementation. + * For use in scenarios where Quartus cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ + template + T reduce(const T* x, Op op) + { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if (N == 1){ + return x[0]; + } + if (N == 2){ + return op(x[0],x[1]); + } + return op(reduce(x, op), reduce(x+leftN, op)); + } + + + + template + class Op_add{ + public: + T operator()(T a, T b){ + return a + b; + } + }; + + template + class Op_max{ + public: + T operator()(T a, T b){ + return a >= b ? 
a : b; + } + }; + } #endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h index 40f36751cc..1027e8fb00 100755 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h @@ -26,7 +26,6 @@ #include #include #include -#include "nnet_common.h" namespace nnet { @@ -50,6 +49,11 @@ extern size_t trace_type_size; constexpr int ceillog2(int x){ return (x <= 2) ? 1 : 1 + ceillog2((x+1) / 2); } + +constexpr int floorlog2(int x){ + return (x < 2) ? 0 : 1 + floorlog2(x / 2); +} + constexpr int pow2(int x){ return x == 0 ? 1 : 2 * pow2(x - 1); } From 8496fc531e587034468bc1ca8da3814db51aa304 Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Thu, 5 May 2022 11:47:56 +0100 Subject: [PATCH 3/5] Quartus stable Softmax --- .../backends/quartus/passes/core_templates.py | 1 + hls4ml/backends/quartus/quartus_backend.py | 5 + .../firmware/nnet_utils/nnet_activation.h | 140 +++++++++++++----- 3 files changed, 108 insertions(+), 38 deletions(-) diff --git a/hls4ml/backends/quartus/passes/core_templates.py b/hls4ml/backends/quartus/passes/core_templates.py index 88fed63994..c18447f88c 100644 --- a/hls4ml/backends/quartus/passes/core_templates.py +++ b/hls4ml/backends/quartus/passes/core_templates.py @@ -122,6 +122,7 @@ def format(self, node): static const unsigned table_size = {table_size}; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; + static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; typedef {exp_table_t.name} exp_table_t; typedef {inv_table_t.name} inv_table_t; }};\n""" diff --git a/hls4ml/backends/quartus/quartus_backend.py b/hls4ml/backends/quartus/quartus_backend.py index e3e289aaeb..4ee6781f82 100644 --- a/hls4ml/backends/quartus/quartus_backend.py +++ b/hls4ml/backends/quartus/quartus_backend.py @@ -188,3 +188,8 @@ def init_softmax(self, layer): layer.set_attr('exp_table_t', layer.get_attr('table_t')) if 'inv_table_t' not in layer.attributes: layer.set_attr('inv_table_t', layer.get_attr('table_t')) + if layer.model.config.is_resource_strategy(layer): + # 'resource' strategy = 'latency' for Softmax + layer.set_attr('implementation', 'latency') + else: + layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower()) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h index 1eb9524fe4..5b4452199d 100755 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h @@ -126,47 +126,111 @@ void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) // ************************************************* // Softmax Activation // ************************************************* + +enum class softmax_implementation {latency=0, legacy=1, stable=2}; + +template +inline unsigned softmax_idx_from_real_val(const data_T x){ + // Number of address bits for table + static constexpr int N = ceillog2(CONFIG_T::table_size); + + // Slice the top N bits of the input + hls_register ac_int y = x.template slc(x.width-N); + return y.to_uint(); +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ + // Look-up tables + #include "activation_tables/exp_table.tb" + #include "activation_tables/invert_table.tb" + + // Find maximum + Op_max op_max; 
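+    // Sketch of the intended instantiation, assuming the template<class T, int N, class Op>
+    // signature of reduce declared in nnet_common.h:
+    //     data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);
+    // i.e. the binary max operator is folded over all CONFIG_T::n_in inputs by the balanced tree.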
+ hls_register data_T x_max = reduce>(data, op_max); + + // Calculate differences from the maximum, forcing rounding and saturation for better accuracy + hls_register ac_fixed d_xi_xmax[CONFIG_T::n_in]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table[softmax_idx_from_real_val(d_xi_xmax[i])]; + } + + // Explicitly sum previously calculated exponentials with an adder tree + Op_add op_add; + hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[softmax_idx_from_real_val(exp_sum)]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + res[i] = exp_res[i] * inv_exp_sum; + } +} + template -void softmax( data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - #include "activation_tables/exp_table.tb" - #include "activation_tables/invert_table.tb" + #include "activation_tables/exp_table_legacy.tb" + #include "activation_tables/invert_table_legacy.tb" - hls_register int data_round[CONFIG_T::n_in]; - New_loop: - #pragma unroll - for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; - - typename CONFIG_T::exp_table_t temp_exp = exp_table[index]; - exp_res_temp += temp_exp; - } - } - int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int(); - if (exp_res_index < 0) exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; - res[ii] = invert_table[exp_res_index]; - } + hls_register int data_round[CONFIG_T::n_in]; + New_loop: + #pragma unroll + for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + + typename CONFIG_T::exp_table_t temp_exp = exp_table_legacy[index]; + exp_res_temp += temp_exp; + } + } + int exp_res_index = (exp_res_temp * CONFIG_T::table_size/64).to_int(); + if (exp_res_index < 0) exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; + res[ii] = invert_table_legacy[exp_res_index]; + } +} + +template +inline void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ + switch(CONFIG_T::implementation) { + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + default: + softmax_stable(data, res); + break; + } } // ************************************************* From 7351c76f18231a91f7cd706a269c482fe3ffd460 Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Thu, 5 May 2022 12:08:09 +0100 Subject: [PATCH 4/5] Quartus latency Softmax --- .../firmware/nnet_utils/nnet_activation.h | 49 +++++++++-- hls4ml/writer/quartus_writer.py | 82 +++++++++++++++++++ 2 files changed, 123 insertions(+), 8 deletions(-) diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h index 5b4452199d..ce1c2b1f6b 100755 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_activation.h @@ -175,9 +175,42 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ } } +// 
TODO - Improve accuracy +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){ + /* + * Note: The latency tables are equivalent to stable tables + * However, the compiler cannot include the same table twice + * Therefore, an out-of-scope exception is thrown in one of the functions + * Temporary solution - Create the same table twice in quartus_writer.py + * Long-term solution - Only create tables needed by the network; + * Currently, quartus-writer.py generates LUTs for all activations, + * Regardless if they are present in the network or not + */ + #include "activation_tables/exp_table_latency.tb" + #include "activation_tables/invert_table_latency.tb" + + // Calculate all the e^x's + hls_register typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++) { + exp_res[i] = exp_table_latency[softmax_idx_from_real_val(data[i])]; + } + + // Explicitly sum the results with an adder tree. + Op_add op_add; + hls_register typename CONFIG_T::exp_table_t exp_sum = reduce>(exp_res, op_add); + + // Multiply previously calculated exponetials with the reciprocal of the sum + hls_register typename CONFIG_T::inv_table_t inv_exp_sum = invert_table_latency[softmax_idx_from_real_val(exp_sum)]; + #pragma unroll + for(unsigned i = 0; i < CONFIG_T::n_in; i++){ + res[i] = exp_res[i] * inv_exp_sum; + } +} + template -void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -{ +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { #include "activation_tables/exp_table_legacy.tb" #include "activation_tables/invert_table_legacy.tb" @@ -193,14 +226,11 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) typename CONFIG_T::exp_table_t exp_res_temp = 0; NN_Inner: #pragma unroll - for (int jj=0; jj(data, res); break; + case softmax_implementation::latency: + softmax_latency(data, res); + break; case softmax_implementation::legacy: softmax_legacy(data, res); break; diff --git a/hls4ml/writer/quartus_writer.py b/hls4ml/writer/quartus_writer.py index 99559a6177..f8c35eea05 100644 --- a/hls4ml/writer/quartus_writer.py +++ b/hls4ml/writer/quartus_writer.py @@ -695,6 +695,86 @@ def __write_invert_table(self, model, path): h_file.write('\n#endif\n') h_file.close() + def __write_exp_table_latency(self, model, path): + table_name = 'exp_table_latency' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision + # 6 bits for integer part, 10 bits for decimal - total, 16 + fp_bits = 16 + fp_integer = 6 + fp_signed = True + + # Exp table should use the same precision as exp_table, as seen in Vivado code + # init_exp_table(exp_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_input_variable().type + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.exp_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + 
h_file.write('\n#endif\n') + h_file.close() + + def __write_invert_table_latency(self, model, path): + table_name = 'invert_table_latency' + table_size = self.__get_table_size(model, 'softmax') + + h_file = open('{}/{}.tb'.format(path, table_name), 'w') + h_file.write(self.__get_table_header(table_name, table_size)) + + # Default fixed point precision, in case values from layer attributes cannot be extracted + # 8 bits for integer part, 10 bits for decimal - total, 18 + fp_bits = 18 + fp_integer = 8 + fp_signed = True + + # Invert table should use the same precision as exp_table, as seen in Vivado code + # init_invert_table(invert_table); + for layer in model.get_layers(): + if layer.name == 'softmax': + ac_type = layer.get_attr('exp_table_t') + if ac_type is not None: + try: + fp_bits = ac_type.precision.integer + ac_type.precision.fractional + fp_integer = ac_type.precision.integer + fp_signed = ac_type.precision.signed + except: + # FixedPrecisionType wasn't correctly stored in layer attributes, use default values + pass + + sep = '' + N = ceil_log2(table_size) + for i in range(table_size): + f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed) + f.set_msb_bits(uint_to_binary(i, N)) + real_val = f.inv_float() + h_file.write(sep + str(real_val)) + sep = ", " + + h_file.write('};\n') + h_file.write('\n#endif\n') + h_file.close() + def __write_exp_table_legacy(self, model, path): table_name = 'exp_table_legacy' table_size = self.__get_table_size(model, 'softmax') @@ -749,6 +829,8 @@ def write_activation_tables(self, model): self.__write_selu_table(model, dstpath) self.__write_exp_table(model, dstpath) self.__write_invert_table(model, dstpath) + self.__write_exp_table_latency(model, dstpath) + self.__write_invert_table_latency(model, dstpath) self.__write_exp_table_legacy(model, dstpath) self.__write_invert_table_legacy(model, dstpath) From a7753fbe3d7de2ac82f17ae51f953b3cba133ebe Mon Sep 17 00:00:00 2001 From: Benjamin Ramhorst Date: Thu, 5 May 2022 12:09:19 +0100 Subject: [PATCH 5/5] Quartus tests for Softmax --- test/pytest/test_softmax.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/test/pytest/test_softmax.py b/test/pytest/test_softmax.py index 44bfb9dc63..08625d9314 100644 --- a/test/pytest/test_softmax.py +++ b/test/pytest/test_softmax.py @@ -24,20 +24,27 @@ def generate_data(function, input_shape): return function((1000, *input_shape)) -# TODO: include latency strategy with flat_distribution when it can be made to pass +# TODO: Include latency strategy with flat_distribution when it can be made to pass @pytest.mark.parametrize('backend,strategy,function,input_shape,io_type', [ - #('latency', flat_distribution, (8,), 'io_parallel'), - #('latency', flat_distribution, (8, 8, 3), 'io_stream'), ('Vivado', 'stable', flat_distribution, (8,), 'io_parallel'), ('Vivado', 'stable', high_accuracy_distribution, (8,), 'io_parallel'), - ('Quartus', 'resource', flat_distribution, (8,), 'io_parallel'), - ('Quartus', 'resource', high_accuracy_distribution, (8,), 'io_parallel'), + + ('Quartus', 'stable', flat_distribution, (8,), 'io_parallel'), + ('Quartus', 'stable', high_accuracy_distribution, (8,), 'io_parallel'), + + # Streaming, single-dimensional implementation (not supported on Quartus yet) ('Vivado', 'stable', flat_distribution, (8,), 'io_stream'), ('Vivado', 'stable', high_accuracy_distribution, (8,), 'io_stream'), + # Multi-dimensional tests, only for io_stream for now ('Vivado', 'stable', flat_distribution, (8, 8, 3), 'io_stream'), - 
('Vivado', 'stable', high_accuracy_distribution, (8, 8, 3), 'io_stream') - + ('Vivado', 'stable', high_accuracy_distribution, (8, 8, 3), 'io_stream'), + + # Latency, include when test pass + #('Vivado', 'latency', flat_distribution, (8,), 'io_parallel'), + #('Vivado', 'latency', flat_distribution, (8, 8, 3), 'io_stream'), + #('Quartus', 'latency', flat_distribution, (8,), 'io_parallel'), + ]) def test_softmax(backend, strategy, generate_data, input_shape, io_type): X = generate_data