add thneed optimizer (commaai#23772)
* add thneed optimizer

* local work group opt

* kernels and final mods

* release files

* build system touchups

* fix kernel path, rand inputs for self test

* broken since extra is gone

* update model replay ref

Co-authored-by: Comma Device <device@comma.ai>
geohot and Comma Device committed Feb 15, 2022
1 parent 7176f5c commit 90beaeb
Showing 12 changed files with 903 additions and 7 deletions.
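
The "local work group opt" bullet refers to tuning the OpenCL local work-group size used for each kernel. How optimizer.cc chooses the sizes is not shown on this page; the sketch below is only a minimal illustration of the idea (time_kernel, pick_local_size, and the candidate sizes are made-up names, not thneed code): launch the kernel with several candidate local sizes, time each run with OpenCL profiling events, and keep the fastest.

#include <CL/cl.h>
#include <cstdio>

// Time one launch of the kernel with a given local work-group size.
// The command queue must have been created with CL_QUEUE_PROFILING_ENABLE.
static double time_kernel(cl_command_queue q, cl_kernel k,
                          const size_t global[3], const size_t local[3]) {
  cl_event ev;
  if (clEnqueueNDRangeKernel(q, k, 3, NULL, global, local, 0, NULL, &ev) != CL_SUCCESS)
    return 1e18;  // this local size is not usable for this kernel
  clFinish(q);
  cl_ulong start = 0, end = 0;
  clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
  clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
  clReleaseEvent(ev);
  return (double)(end - start);  // nanoseconds
}

// Try a handful of candidate local sizes and keep the fastest one that divides
// the global size (required before OpenCL 2.0).
static void pick_local_size(cl_command_queue q, cl_kernel k,
                            const size_t global[3], size_t best[3]) {
  const size_t candidates[][3] = {{1,1,1}, {2,2,1}, {4,4,1}, {8,4,1}, {4,8,1}, {8,8,1}};
  double best_t = 1e18;
  best[0] = best[1] = best[2] = 1;
  for (const auto &c : candidates) {
    if (global[0] % c[0] || global[1] % c[1] || global[2] % c[2]) continue;
    double t = time_kernel(q, k, global, c);
    if (t < best_t) { best_t = t; best[0] = c[0]; best[1] = c[1]; best[2] = c[2]; }
  }
  printf("best local size %zux%zux%zu: %.0f ns\n", best[0], best[1], best[2], best_t);
}
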
2 changes: 2 additions & 0 deletions release/files_common
@@ -426,7 +426,9 @@ selfdrive/modeld/transforms/transform.cl
 selfdrive/modeld/thneed/thneed.*
 selfdrive/modeld/thneed/serialize.cc
 selfdrive/modeld/thneed/compile.cc
+selfdrive/modeld/thneed/optimizer.cc
 selfdrive/modeld/thneed/include/*
+selfdrive/modeld/thneed/kernels/*.cl
 
 selfdrive/modeld/runners/snpemodel.cc
 selfdrive/modeld/runners/snpemodel.h
13 changes: 10 additions & 3 deletions selfdrive/modeld/SConscript
@@ -1,3 +1,5 @@
+import os
+
 Import('env', 'arch', 'cereal', 'messaging', 'common', 'gpucommon', 'visionipc')
 lenv = env.Clone()
 
@@ -23,6 +25,7 @@ common_src = [
 thneed_src = [
   "thneed/thneed.cc",
   "thneed/serialize.cc",
+  "thneed/optimizer.cc",
   "runners/thneedmodel.cc",
 ]
 
@@ -62,12 +65,16 @@ common_model = lenv.Object(common_src)

 # build thneed model
 if use_thneed and arch in ("aarch64", "larch64"):
+  fn = "../../models/supercombo"
   compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
-  cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} ../../models/supercombo.dlc ../../models/supercombo.thneed --binary"
+  cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} {fn}.dlc {fn}.thneed --binary"
 
   lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
-  cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}"})
-  cenv.Command("../../models/supercombo.thneed", ["../../models/supercombo.dlc", compiler], cmd)
+  kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
+  cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
+
+  kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
+  cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
 
 lenv.Program('_dmonitoringmodeld', [
   "dmonitoringmodeld.cc",
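
Two things are worth noting in the build changes above: the thneed compiler now runs with KERNEL_PATH pointing at thneed/kernels, and the .cl files are listed as dependencies of supercombo.thneed, so SCons re-runs the compile step whenever a hand-written kernel changes. How optimizer.cc actually consumes KERNEL_PATH is not part of this page; the snippet below is only an assumption about the general pattern (load_kernel_source and its arguments are hypothetical): look up a replacement kernel source by name under KERNEL_PATH and fall back to the original source if no file is found.

#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>

// Hypothetical helper: return the replacement kernel source shipped under
// KERNEL_PATH (set by the SConscript above), or the original source if none exists.
std::string load_kernel_source(const std::string &name, const std::string &fallback_src) {
  const char *kernel_path = getenv("KERNEL_PATH");
  if (kernel_path == NULL) return fallback_src;
  std::ifstream f(std::string(kernel_path) + "/" + name + ".cl");
  if (!f.good()) return fallback_src;  // no hand-written kernel for this name
  std::stringstream ss;
  ss << f.rdbuf();
  return ss.str();
}
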
11 changes: 9 additions & 2 deletions selfdrive/modeld/runners/snpemodel.cc
@@ -7,6 +7,7 @@
 #include <cstring>
 
 #include "selfdrive/common/util.h"
+#include "selfdrive/common/timing.h"
 
void PrintErrorStringAndExit() {
std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
@@ -158,8 +159,14 @@ void SNPEModel::execute(float *net_input_buf, int buf_size) {
 float *outputs_golden = (float *)malloc(output_size*sizeof(float));
 memcpy(outputs_golden, output, output_size*sizeof(float));
 memset(output, 0, output_size*sizeof(float));
-memset(recurrent, 0, recurrent_size*sizeof(float));
-thneed->execute(inputs, output);
+
+for (int i = 0; i < 5; i++) {
+  memset(recurrent, 0, recurrent_size*sizeof(float));
+  uint64_t start_time = nanos_since_boot();
+  thneed->execute(inputs, output);
+  uint64_t elapsed_time = nanos_since_boot() - start_time;
+  printf("ran model in %.2f ms\n", float(elapsed_time)/1e6);
+}
 
 if (memcmp(output, outputs_golden, output_size*sizeof(float)) == 0) {
   printf("thneed selftest passed\n");
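
The self test above clears the recurrent state before every iteration so each of the five timed runs starts from the same state, which is what makes the final output comparable bit-for-bit (memcmp) against the golden output captured from the SNPE run. Timing uses nanos_since_boot() from selfdrive/common/timing.h, which the diff now includes. A stand-alone restatement of that timing pattern (time_call is a hypothetical helper, not part of the diff):

#include <cstdint>
#include <cstdio>

#include "selfdrive/common/timing.h"

// Time a callable with nanos_since_boot() and return milliseconds.
template <typename F>
double time_call(F &&fn) {
  uint64_t start_time = nanos_since_boot();
  fn();
  uint64_t elapsed_time = nanos_since_boot() - start_time;
  return (double)elapsed_time / 1e6;  // nanoseconds -> milliseconds
}

// usage, mirroring the loop above:
//   printf("ran model in %.2f ms\n", time_call([&]{ thneed->execute(inputs, output); }));
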
@@ -0,0 +1,129 @@
__kernel void convolution_horizontal_reduced_reads(
read_only image2d_t input,
short startPackedInputChannel,
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
read_only image2d_t weights, __constant float *biases,
short filterSizeX, short filterSizeY,
write_only image2d_t output,
short paddingX, short paddingY, short strideX, short strideY,
short dilationX, short dilationY,
short neuron, float a, float b, float min_clamp, float max_clamp,
__constant float *parameters, __constant float *batchNormBiases,
short numOutputColumns) {

// init
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
short packedOutputChannel = get_global_id(0);
short startOutputColumn = mul24((short)get_global_id(1), 4);
short outputRow = get_global_id(2);
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
totalNumPackedInputChannels, startPackedInputChannel);
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

float4 outputValues[4];
for (short i = 0; i < 4; ++i) {
outputValues[i] = (float4)(0, 0, 0, 0);
}

int2 inputLocation;
inputLocation.y = mad24(outputRow, strideY, -paddingY);

int2 weightLocation;
weightLocation.x = 0;
weightLocation.y = packedOutputChannel;

// convolution
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
for (short packedInputChannel = 0;
packedInputChannel < numPackedInputChannelsForGroup;
++packedInputChannel) {
short startXForChannel = startX + packedInputChannel;
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
float4 weightValues[4];
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
++weightLocation.x;
}
short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
float4 inputValues[4];
for (short i = 0; i < 4; ++i) {
inputValues[i] = read_imagef(input, smp, inputLocation);
inputLocation.x += strideWithChannels;
}
for (short i = 0; i < 4; ++i) {
float4 curOutputValues = outputValues[i];
curOutputValues.x += inputValues[i].x * weightValues[0].x;
curOutputValues.x += inputValues[i].y * weightValues[0].y;
curOutputValues.x += inputValues[i].z * weightValues[0].z;
curOutputValues.x += inputValues[i].w * weightValues[0].w;
curOutputValues.y += inputValues[i].x * weightValues[1].x;
curOutputValues.y += inputValues[i].y * weightValues[1].y;
curOutputValues.y += inputValues[i].z * weightValues[1].z;
curOutputValues.y += inputValues[i].w * weightValues[1].w;
curOutputValues.z += inputValues[i].x * weightValues[2].x;
curOutputValues.z += inputValues[i].y * weightValues[2].y;
curOutputValues.z += inputValues[i].z * weightValues[2].z;
curOutputValues.z += inputValues[i].w * weightValues[2].w;
curOutputValues.w += inputValues[i].x * weightValues[3].x;
curOutputValues.w += inputValues[i].y * weightValues[3].y;
curOutputValues.w += inputValues[i].z * weightValues[3].z;
curOutputValues.w += inputValues[i].w * weightValues[3].w;
outputValues[i] = curOutputValues;
}
}
}
inputLocation.y += dilationY;
}

// bias
packedOutputChannel += packedOuputChannelOffset;
short outputChannel = mul24(packedOutputChannel, 4);
float4 biasValues = vload4(0, biases + outputChannel);
for (short i = 0; i < 4; ++i) {
outputValues[i] += biasValues;
}

// activation
switch (neuron) {
case 1:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], 0.0f);
}
break;
case 2:
for (short i = 0; i < 4; ++i) {
outputValues[i] = a * tanh(b * outputValues[i]);
}
break;
case 3:
for (short i = 0; i < 4; ++i) {
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
}
break;
case 4:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], min_clamp);
outputValues[i] = min(outputValues[i], max_clamp);
}
break;
case 5:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
}
break;
}

// output
int2 outputLocation;
short outputColumn = startOutputColumn;
outputLocation.y = outputRow;
for (short i = 0; i < 4; ++i) {
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
if (outputColumn < numOutputColumns) {
write_imagef(output, outputLocation, outputValues[i]);
}
++outputColumn;
}
}
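
This kernel computes a block of four output columns per work item: get_global_id(0) selects the packed output channel, get_global_id(1) selects a block of four output columns (note the mul24 by 4), and get_global_id(2) selects the output row. Below is a hedged host-side sketch of how such a kernel might be enqueued (enqueue_conv and its parameters are assumptions, not thneed code); the tuned local work-group size would come from the optimizer.

#include <CL/cl.h>

// Enqueue the convolution with the work sizes the kernel expects; kernel
// arguments (images, weights, biases, strides, ...) are assumed to have been
// set already with clSetKernelArg.
void enqueue_conv(cl_command_queue q, cl_kernel k,
                  int packedOutputChannels, int numOutputColumns, int numOutputRows,
                  const size_t *tuned_local /* NULL lets the driver choose */) {
  size_t global[3] = {
    (size_t)packedOutputChannels,          // get_global_id(0): packed output channel
    (size_t)((numOutputColumns + 3) / 4),  // get_global_id(1): one work-item per 4 columns
    (size_t)numOutputRows,                 // get_global_id(2): output row
  };
  clEnqueueNDRangeKernel(q, k, 3, NULL, global, tuned_local, 0, NULL, NULL);
}
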
@@ -0,0 +1,140 @@
__kernel void convolution_horizontal_reduced_reads_1x1(
read_only image2d_t input,
short startPackedInputChannel,
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
read_only image2d_t weights, __constant float *biases,
short filterSizeX, short filterSizeY,
write_only image2d_t output,
short paddingX, short paddingY, short strideX, short strideY,
short neuron, float a, float b, float min_clamp, float max_clamp,
__constant float *parameters, __constant float *batchNormBiases,
short numOutputColumns,
short doAccumulate, read_only image2d_t accumulator) {

// init
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
short packedOutputChannel = get_global_id(0);
short startOutputColumn = mul24((short)get_global_id(1), 4);
short outputRow = get_global_id(2);
short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
totalNumPackedInputChannels, startPackedInputChannel);
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

float4 outputValues[4];
for (short i = 0; i < 4; ++i) {
outputValues[i] = (float4)(0, 0, 0, 0);
}

int2 inputLocation;
inputLocation.y = mad24(outputRow, strideY, -paddingY);

int2 weightLocation;
weightLocation.x = 0;
weightLocation.y = packedOutputChannel;

// convolution
for (short packedInputChannel = startPackedInputChannel;
packedInputChannel < endPackedInputChannel; ++packedInputChannel) {

float4 weightValues[4];
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
++weightLocation.x;
}

inputLocation.x = startX + packedInputChannel;
float4 inputValues[4];
for (short i = 0; i < 4; ++i) {
inputValues[i] = read_imagef(input, smp, inputLocation);
inputLocation.x += strideWithChannels;
}

for (short i = 0; i < 4; ++i) {
float4 curOutputValues = outputValues[i];
curOutputValues.x += inputValues[i].x * weightValues[0].x;
curOutputValues.x += inputValues[i].y * weightValues[0].y;
curOutputValues.x += inputValues[i].z * weightValues[0].z;
curOutputValues.x += inputValues[i].w * weightValues[0].w;
curOutputValues.y += inputValues[i].x * weightValues[1].x;
curOutputValues.y += inputValues[i].y * weightValues[1].y;
curOutputValues.y += inputValues[i].z * weightValues[1].z;
curOutputValues.y += inputValues[i].w * weightValues[1].w;
curOutputValues.z += inputValues[i].x * weightValues[2].x;
curOutputValues.z += inputValues[i].y * weightValues[2].y;
curOutputValues.z += inputValues[i].z * weightValues[2].z;
curOutputValues.z += inputValues[i].w * weightValues[2].w;
curOutputValues.w += inputValues[i].x * weightValues[3].x;
curOutputValues.w += inputValues[i].y * weightValues[3].y;
curOutputValues.w += inputValues[i].z * weightValues[3].z;
curOutputValues.w += inputValues[i].w * weightValues[3].w;
outputValues[i] = curOutputValues;
}
}

// bias
packedOutputChannel += packedOuputChannelOffset;
short outputChannel = mul24(packedOutputChannel, 4);
float4 biasValues = vload4(0, biases + outputChannel);
for (short i = 0; i < 4; ++i) {
outputValues[i] += biasValues;
}

// accumulate
if (doAccumulate) {
int2 outputLocation;
short outputColumn = startOutputColumn;
outputLocation.y = outputRow;
for (short i = 0; i < 4; ++i) {
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
if (outputColumn < numOutputColumns) {
outputValues[i] += read_imagef(accumulator, smp, outputLocation);
}
++outputColumn;
}
}

// activation
switch (neuron) {
case 1:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], 0.0f);
}
break;
case 2:
for (short i = 0; i < 4; ++i) {
outputValues[i] = a * tanh(b * outputValues[i]);
}
break;
case 3:
for (short i = 0; i < 4; ++i) {
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
}
break;
case 4:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], min_clamp);
outputValues[i] = min(outputValues[i], max_clamp);
}
break;
case 5:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
}
break;
}

// output
int2 outputLocation;
short outputColumn = startOutputColumn;
outputLocation.y = outputRow;
for (short i = 0; i < 4; ++i) {
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
if (outputColumn < numOutputColumns) {
write_imagef(output, outputLocation, outputValues[i]);
}
++outputColumn;
}
}
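
The 1x1 variant above drops the filterSizeX/filterSizeY loops (the receptive field is a single location) and, when doAccumulate is set, adds a value read from the accumulator image before the activation, which is how partial sums from a previous pass can be folded in. Both kernels share the same neuron switch; restated in plain C++ for reference (apply_neuron is not part of thneed, just a per-component transcription of the switch):

#include <math.h>

float apply_neuron(short neuron, float x, float a, float b, float min_clamp, float max_clamp) {
  switch (neuron) {
    case 1: return fmaxf(x, 0.0f);                                      // ReLU
    case 2: return a * tanhf(b * x);                                    // scaled tanh
    case 3: return 1.0f / (1.0f + expf(-a * x + b));                    // sigmoid-style
    case 4: return fminf(fmaxf(x, min_clamp), max_clamp);               // clamp
    case 5: return fmaxf(x, 0.0f) + a * (expf(fminf(x, 0.0f)) - 1.0f);  // ELU-style
    default: return x;                                                  // no activation
  }
}
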
