forked from commaai/openpilot
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add thneed optimizer (commaai#23772)
* add thneed optimizer * local work group opt * kernels and final mods * release files * build system touchups * fix kernel path, rand inputs for self test * broken since extra is gone * update model replay ref Co-authored-by: Comma Device <device@comma.ai>
- Loading branch information
Showing
12 changed files
with
903 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
129 changes: 129 additions & 0 deletions
129
selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads.cl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
__kernel void convolution_horizontal_reduced_reads( | ||
read_only image2d_t input, | ||
short startPackedInputChannel, | ||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, | ||
short packedOuputChannelOffset, short totalNumPackedOutputChannels, | ||
read_only image2d_t weights, __constant float *biases, | ||
short filterSizeX, short filterSizeY, | ||
write_only image2d_t output, | ||
short paddingX, short paddingY, short strideX, short strideY, | ||
short dilationX, short dilationY, | ||
short neuron, float a, float b, float min_clamp, float max_clamp, | ||
__constant float *parameters, __constant float *batchNormBiases, | ||
short numOutputColumns) { | ||
|
||
// init | ||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||
short packedOutputChannel = get_global_id(0); | ||
short startOutputColumn = mul24((short)get_global_id(1), 4); | ||
short outputRow = get_global_id(2); | ||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), | ||
totalNumPackedInputChannels, startPackedInputChannel); | ||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); | ||
|
||
float4 outputValues[4]; | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = (float4)(0, 0, 0, 0); | ||
} | ||
|
||
int2 inputLocation; | ||
inputLocation.y = mad24(outputRow, strideY, -paddingY); | ||
|
||
int2 weightLocation; | ||
weightLocation.x = 0; | ||
weightLocation.y = packedOutputChannel; | ||
|
||
// convolution | ||
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) { | ||
for (short packedInputChannel = 0; | ||
packedInputChannel < numPackedInputChannelsForGroup; | ||
++packedInputChannel) { | ||
short startXForChannel = startX + packedInputChannel; | ||
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) { | ||
float4 weightValues[4]; | ||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { | ||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); | ||
++weightLocation.x; | ||
} | ||
short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX); | ||
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel); | ||
float4 inputValues[4]; | ||
for (short i = 0; i < 4; ++i) { | ||
inputValues[i] = read_imagef(input, smp, inputLocation); | ||
inputLocation.x += strideWithChannels; | ||
} | ||
for (short i = 0; i < 4; ++i) { | ||
float4 curOutputValues = outputValues[i]; | ||
curOutputValues.x += inputValues[i].x * weightValues[0].x; | ||
curOutputValues.x += inputValues[i].y * weightValues[0].y; | ||
curOutputValues.x += inputValues[i].z * weightValues[0].z; | ||
curOutputValues.x += inputValues[i].w * weightValues[0].w; | ||
curOutputValues.y += inputValues[i].x * weightValues[1].x; | ||
curOutputValues.y += inputValues[i].y * weightValues[1].y; | ||
curOutputValues.y += inputValues[i].z * weightValues[1].z; | ||
curOutputValues.y += inputValues[i].w * weightValues[1].w; | ||
curOutputValues.z += inputValues[i].x * weightValues[2].x; | ||
curOutputValues.z += inputValues[i].y * weightValues[2].y; | ||
curOutputValues.z += inputValues[i].z * weightValues[2].z; | ||
curOutputValues.z += inputValues[i].w * weightValues[2].w; | ||
curOutputValues.w += inputValues[i].x * weightValues[3].x; | ||
curOutputValues.w += inputValues[i].y * weightValues[3].y; | ||
curOutputValues.w += inputValues[i].z * weightValues[3].z; | ||
curOutputValues.w += inputValues[i].w * weightValues[3].w; | ||
outputValues[i] = curOutputValues; | ||
} | ||
} | ||
} | ||
inputLocation.y += dilationY; | ||
} | ||
|
||
// bias | ||
packedOutputChannel += packedOuputChannelOffset; | ||
short outputChannel = mul24(packedOutputChannel, 4); | ||
float4 biasValues = vload4(0, biases + outputChannel); | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] += biasValues; | ||
} | ||
|
||
// activation | ||
switch (neuron) { | ||
case 1: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = max(outputValues[i], 0.0f); | ||
} | ||
break; | ||
case 2: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = a * tanh(b * outputValues[i]); | ||
} | ||
break; | ||
case 3: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); | ||
} | ||
break; | ||
case 4: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = max(outputValues[i], min_clamp); | ||
outputValues[i] = min(outputValues[i], max_clamp); | ||
} | ||
break; | ||
case 5: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); | ||
} | ||
break; | ||
} | ||
|
||
// output | ||
int2 outputLocation; | ||
short outputColumn = startOutputColumn; | ||
outputLocation.y = outputRow; | ||
for (short i = 0; i < 4; ++i) { | ||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); | ||
if (outputColumn < numOutputColumns) { | ||
write_imagef(output, outputLocation, outputValues[i]); | ||
} | ||
++outputColumn; | ||
} | ||
} |
140 changes: 140 additions & 0 deletions
140
selfdrive/modeld/thneed/kernels/convolution_horizontal_reduced_reads_1x1.cl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
__kernel void convolution_horizontal_reduced_reads_1x1( | ||
read_only image2d_t input, | ||
short startPackedInputChannel, | ||
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels, | ||
short packedOuputChannelOffset, short totalNumPackedOutputChannels, | ||
read_only image2d_t weights, __constant float *biases, | ||
short filterSizeX, short filterSizeY, | ||
write_only image2d_t output, | ||
short paddingX, short paddingY, short strideX, short strideY, | ||
short neuron, float a, float b, float min_clamp, float max_clamp, | ||
__constant float *parameters, __constant float *batchNormBiases, | ||
short numOutputColumns, | ||
short doAccumulate, read_only image2d_t accumulator) { | ||
|
||
// init | ||
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||
short packedOutputChannel = get_global_id(0); | ||
short startOutputColumn = mul24((short)get_global_id(1), 4); | ||
short outputRow = get_global_id(2); | ||
short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup; | ||
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), | ||
totalNumPackedInputChannels, startPackedInputChannel); | ||
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels); | ||
|
||
float4 outputValues[4]; | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = (float4)(0, 0, 0, 0); | ||
} | ||
|
||
int2 inputLocation; | ||
inputLocation.y = mad24(outputRow, strideY, -paddingY); | ||
|
||
int2 weightLocation; | ||
weightLocation.x = 0; | ||
weightLocation.y = packedOutputChannel; | ||
|
||
// convolution | ||
for (short packedInputChannel = startPackedInputChannel; | ||
packedInputChannel < endPackedInputChannel; ++packedInputChannel) { | ||
|
||
float4 weightValues[4]; | ||
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) { | ||
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation); | ||
++weightLocation.x; | ||
} | ||
|
||
inputLocation.x = startX + packedInputChannel; | ||
float4 inputValues[4]; | ||
for (short i = 0; i < 4; ++i) { | ||
inputValues[i] = read_imagef(input, smp, inputLocation); | ||
inputLocation.x += strideWithChannels; | ||
} | ||
|
||
for (short i = 0; i < 4; ++i) { | ||
float4 curOutputValues = outputValues[i]; | ||
curOutputValues.x += inputValues[i].x * weightValues[0].x; | ||
curOutputValues.x += inputValues[i].y * weightValues[0].y; | ||
curOutputValues.x += inputValues[i].z * weightValues[0].z; | ||
curOutputValues.x += inputValues[i].w * weightValues[0].w; | ||
curOutputValues.y += inputValues[i].x * weightValues[1].x; | ||
curOutputValues.y += inputValues[i].y * weightValues[1].y; | ||
curOutputValues.y += inputValues[i].z * weightValues[1].z; | ||
curOutputValues.y += inputValues[i].w * weightValues[1].w; | ||
curOutputValues.z += inputValues[i].x * weightValues[2].x; | ||
curOutputValues.z += inputValues[i].y * weightValues[2].y; | ||
curOutputValues.z += inputValues[i].z * weightValues[2].z; | ||
curOutputValues.z += inputValues[i].w * weightValues[2].w; | ||
curOutputValues.w += inputValues[i].x * weightValues[3].x; | ||
curOutputValues.w += inputValues[i].y * weightValues[3].y; | ||
curOutputValues.w += inputValues[i].z * weightValues[3].z; | ||
curOutputValues.w += inputValues[i].w * weightValues[3].w; | ||
outputValues[i] = curOutputValues; | ||
} | ||
} | ||
|
||
// bias | ||
packedOutputChannel += packedOuputChannelOffset; | ||
short outputChannel = mul24(packedOutputChannel, 4); | ||
float4 biasValues = vload4(0, biases + outputChannel); | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] += biasValues; | ||
} | ||
|
||
// accumulate | ||
if (doAccumulate) { | ||
int2 outputLocation; | ||
short outputColumn = startOutputColumn; | ||
outputLocation.y = outputRow; | ||
for (short i = 0; i < 4; ++i) { | ||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); | ||
if (outputColumn < numOutputColumns) { | ||
outputValues[i] += read_imagef(accumulator, smp, outputLocation); | ||
} | ||
++outputColumn; | ||
} | ||
} | ||
|
||
// activation | ||
switch (neuron) { | ||
case 1: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = max(outputValues[i], 0.0f); | ||
} | ||
break; | ||
case 2: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = a * tanh(b * outputValues[i]); | ||
} | ||
break; | ||
case 3: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b)); | ||
} | ||
break; | ||
case 4: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = max(outputValues[i], min_clamp); | ||
outputValues[i] = min(outputValues[i], max_clamp); | ||
} | ||
break; | ||
case 5: | ||
for (short i = 0; i < 4; ++i) { | ||
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f); | ||
} | ||
break; | ||
} | ||
|
||
// output | ||
int2 outputLocation; | ||
short outputColumn = startOutputColumn; | ||
outputLocation.y = outputRow; | ||
for (short i = 0; i < 4; ++i) { | ||
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel); | ||
if (outputColumn < numOutputColumns) { | ||
write_imagef(output, outputLocation, outputValues[i]); | ||
} | ||
++outputColumn; | ||
} | ||
} | ||
|
Oops, something went wrong.