add thneed optimizer (commaai#23772)
* add thneed optimizer

* local work group opt

* kernels and final mods

* release files

* build system touchups

* fix kernel path, rand inputs for self test

* broken since extra is gone

* update model replay ref

Co-authored-by: Comma Device <device@comma.ai>
geohot and Comma Device committed Feb 15, 2022
1 parent 7176f5c commit 90beaeb
Showing 12 changed files with 903 additions and 7 deletions.
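
The "local work group opt" bullet refers to tuning the OpenCL local work-group size used for each kernel. How optimizer.cc chooses the sizes is not shown on this page; the sketch below is only a minimal illustration of the idea (time_kernel, pick_local_size, and the candidate sizes are made-up names, not thneed code): launch the kernel with several candidate local sizes, time each run with OpenCL profiling events, and keep the fastest.

#include <CL/cl.h>
#include <cstdio>

// Time one launch of the kernel with a given local work-group size.
// The command queue must have been created with CL_QUEUE_PROFILING_ENABLE.
static double time_kernel(cl_command_queue q, cl_kernel k,
                          const size_t global[3], const size_t local[3]) {
  cl_event ev;
  if (clEnqueueNDRangeKernel(q, k, 3, NULL, global, local, 0, NULL, &ev) != CL_SUCCESS)
    return 1e18;  // this local size is not usable for this kernel
  clFinish(q);
  cl_ulong start = 0, end = 0;
  clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL);
  clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL);
  clReleaseEvent(ev);
  return (double)(end - start);  // nanoseconds
}

// Try a handful of candidate local sizes and keep the fastest one that divides
// the global size (required before OpenCL 2.0).
static void pick_local_size(cl_command_queue q, cl_kernel k,
                            const size_t global[3], size_t best[3]) {
  const size_t candidates[][3] = {{1,1,1}, {2,2,1}, {4,4,1}, {8,4,1}, {4,8,1}, {8,8,1}};
  double best_t = 1e18;
  best[0] = best[1] = best[2] = 1;
  for (const auto &c : candidates) {
    if (global[0] % c[0] || global[1] % c[1] || global[2] % c[2]) continue;
    double t = time_kernel(q, k, global, c);
    if (t < best_t) { best_t = t; best[0] = c[0]; best[1] = c[1]; best[2] = c[2]; }
  }
  printf("best local size %zux%zux%zu: %.0f ns\n", best[0], best[1], best[2], best_t);
}
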
2 changes: 2 additions & 0 deletions release/files_common
@@ -426,7 +426,9 @@ selfdrive/modeld/transforms/transform.cl
 selfdrive/modeld/thneed/thneed.*
 selfdrive/modeld/thneed/serialize.cc
 selfdrive/modeld/thneed/compile.cc
+selfdrive/modeld/thneed/optimizer.cc
 selfdrive/modeld/thneed/include/*
+selfdrive/modeld/thneed/kernels/*.cl
 
 selfdrive/modeld/runners/snpemodel.cc
 selfdrive/modeld/runners/snpemodel.h
13 changes: 10 additions & 3 deletions selfdrive/modeld/SConscript
@@ -1,3 +1,5 @@
+import os
+
 Import('env', 'arch', 'cereal', 'messaging', 'common', 'gpucommon', 'visionipc')
 lenv = env.Clone()
 
@@ -23,6 +25,7 @@ common_src = [
 thneed_src = [
   "thneed/thneed.cc",
   "thneed/serialize.cc",
+  "thneed/optimizer.cc",
   "runners/thneedmodel.cc",
 ]
 
@@ -62,12 +65,16 @@ common_model = lenv.Object(common_src)

 # build thneed model
 if use_thneed and arch in ("aarch64", "larch64"):
+  fn = "../../models/supercombo"
   compiler = lenv.Program('thneed/compile', ["thneed/compile.cc"]+common_model, LIBS=libs)
-  cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} ../../models/supercombo.dlc ../../models/supercombo.thneed --binary"
+  cmd = f"cd {Dir('.').abspath} && {compiler[0].abspath} {fn}.dlc {fn}.thneed --binary"
 
   lib_paths = ':'.join(Dir(p).abspath for p in lenv["LIBPATH"])
-  cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}"})
-  cenv.Command("../../models/supercombo.thneed", ["../../models/supercombo.dlc", compiler], cmd)
+  kernel_path = os.path.join(Dir('.').abspath, "thneed", "kernels")
+  cenv = Environment(ENV={'LD_LIBRARY_PATH': f"{lib_paths}:{lenv['ENV']['LD_LIBRARY_PATH']}", 'KERNEL_PATH': kernel_path})
+
+  kernels = [os.path.join(kernel_path, x) for x in os.listdir(kernel_path) if x.endswith(".cl")]
+  cenv.Command(fn + ".thneed", [fn + ".dlc", kernels, compiler], cmd)
 
 lenv.Program('_dmonitoringmodeld', [
   "dmonitoringmodeld.cc",
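
Two things are worth noting in the build changes above: the thneed compiler now runs with KERNEL_PATH pointing at thneed/kernels, and the .cl files are listed as dependencies of supercombo.thneed, so SCons re-runs the compile step whenever a hand-written kernel changes. How optimizer.cc actually consumes KERNEL_PATH is not part of this page; the snippet below is only an assumption about the general pattern (load_kernel_source and its arguments are hypothetical): look up a replacement kernel source by name under KERNEL_PATH and fall back to the original source if no file is found.

#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>

// Hypothetical helper: return the replacement kernel source shipped under
// KERNEL_PATH (set by the SConscript above), or the original source if none exists.
std::string load_kernel_source(const std::string &name, const std::string &fallback_src) {
  const char *kernel_path = getenv("KERNEL_PATH");
  if (kernel_path == NULL) return fallback_src;
  std::ifstream f(std::string(kernel_path) + "/" + name + ".cl");
  if (!f.good()) return fallback_src;  // no hand-written kernel for this name
  std::stringstream ss;
  ss << f.rdbuf();
  return ss.str();
}
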
11 changes: 9 additions & 2 deletions selfdrive/modeld/runners/snpemodel.cc
@@ -7,6 +7,7 @@
 #include <cstring>
 
 #include "selfdrive/common/util.h"
+#include "selfdrive/common/timing.h"
 
void PrintErrorStringAndExit() {
std::cerr << zdl::DlSystem::getLastErrorString() << std::endl;
@@ -158,8 +159,14 @@ void SNPEModel::execute(float *net_input_buf, int buf_size) {
 float *outputs_golden = (float *)malloc(output_size*sizeof(float));
 memcpy(outputs_golden, output, output_size*sizeof(float));
 memset(output, 0, output_size*sizeof(float));
-memset(recurrent, 0, recurrent_size*sizeof(float));
-thneed->execute(inputs, output);
+
+for (int i = 0; i < 5; i++) {
+  memset(recurrent, 0, recurrent_size*sizeof(float));
+  uint64_t start_time = nanos_since_boot();
+  thneed->execute(inputs, output);
+  uint64_t elapsed_time = nanos_since_boot() - start_time;
+  printf("ran model in %.2f ms\n", float(elapsed_time)/1e6);
+}
 
 if (memcmp(output, outputs_golden, output_size*sizeof(float)) == 0) {
   printf("thneed selftest passed\n");
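
The self test above clears the recurrent state before every iteration so each of the five timed runs starts from the same state, which is what makes the final output comparable bit-for-bit (memcmp) against the golden output captured from the SNPE run. Timing uses nanos_since_boot() from selfdrive/common/timing.h, which the diff now includes. A stand-alone restatement of that timing pattern (time_call is a hypothetical helper, not part of the diff):

#include <cstdint>
#include <cstdio>

#include "selfdrive/common/timing.h"

// Time a callable with nanos_since_boot() and return milliseconds.
template <typename F>
double time_call(F &&fn) {
  uint64_t start_time = nanos_since_boot();
  fn();
  uint64_t elapsed_time = nanos_since_boot() - start_time;
  return (double)elapsed_time / 1e6;  // nanoseconds -> milliseconds
}

// usage, mirroring the loop above:
//   printf("ran model in %.2f ms\n", time_call([&]{ thneed->execute(inputs, output); }));
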
@@ -0,0 +1,129 @@
__kernel void convolution_horizontal_reduced_reads(
read_only image2d_t input,
short startPackedInputChannel,
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
read_only image2d_t weights, __constant float *biases,
short filterSizeX, short filterSizeY,
write_only image2d_t output,
short paddingX, short paddingY, short strideX, short strideY,
short dilationX, short dilationY,
short neuron, float a, float b, float min_clamp, float max_clamp,
__constant float *parameters, __constant float *batchNormBiases,
short numOutputColumns) {

// init
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
short packedOutputChannel = get_global_id(0);
short startOutputColumn = mul24((short)get_global_id(1), 4);
short outputRow = get_global_id(2);
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
totalNumPackedInputChannels, startPackedInputChannel);
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

float4 outputValues[4];
for (short i = 0; i < 4; ++i) {
outputValues[i] = (float4)(0, 0, 0, 0);
}

int2 inputLocation;
inputLocation.y = mad24(outputRow, strideY, -paddingY);

int2 weightLocation;
weightLocation.x = 0;
weightLocation.y = packedOutputChannel;

// convolution
for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
for (short packedInputChannel = 0;
packedInputChannel < numPackedInputChannelsForGroup;
++packedInputChannel) {
short startXForChannel = startX + packedInputChannel;
for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
float4 weightValues[4];
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
++weightLocation.x;
}
short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
float4 inputValues[4];
for (short i = 0; i < 4; ++i) {
inputValues[i] = read_imagef(input, smp, inputLocation);
inputLocation.x += strideWithChannels;
}
for (short i = 0; i < 4; ++i) {
float4 curOutputValues = outputValues[i];
curOutputValues.x += inputValues[i].x * weightValues[0].x;
curOutputValues.x += inputValues[i].y * weightValues[0].y;
curOutputValues.x += inputValues[i].z * weightValues[0].z;
curOutputValues.x += inputValues[i].w * weightValues[0].w;
curOutputValues.y += inputValues[i].x * weightValues[1].x;
curOutputValues.y += inputValues[i].y * weightValues[1].y;
curOutputValues.y += inputValues[i].z * weightValues[1].z;
curOutputValues.y += inputValues[i].w * weightValues[1].w;
curOutputValues.z += inputValues[i].x * weightValues[2].x;
curOutputValues.z += inputValues[i].y * weightValues[2].y;
curOutputValues.z += inputValues[i].z * weightValues[2].z;
curOutputValues.z += inputValues[i].w * weightValues[2].w;
curOutputValues.w += inputValues[i].x * weightValues[3].x;
curOutputValues.w += inputValues[i].y * weightValues[3].y;
curOutputValues.w += inputValues[i].z * weightValues[3].z;
curOutputValues.w += inputValues[i].w * weightValues[3].w;
outputValues[i] = curOutputValues;
}
}
}
inputLocation.y += dilationY;
}

// bias
packedOutputChannel += packedOuputChannelOffset;
short outputChannel = mul24(packedOutputChannel, 4);
float4 biasValues = vload4(0, biases + outputChannel);
for (short i = 0; i < 4; ++i) {
outputValues[i] += biasValues;
}

// activation
switch (neuron) {
case 1:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], 0.0f);
}
break;
case 2:
for (short i = 0; i < 4; ++i) {
outputValues[i] = a * tanh(b * outputValues[i]);
}
break;
case 3:
for (short i = 0; i < 4; ++i) {
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
}
break;
case 4:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], min_clamp);
outputValues[i] = min(outputValues[i], max_clamp);
}
break;
case 5:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
}
break;
}

// output
int2 outputLocation;
short outputColumn = startOutputColumn;
outputLocation.y = outputRow;
for (short i = 0; i < 4; ++i) {
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
if (outputColumn < numOutputColumns) {
write_imagef(output, outputLocation, outputValues[i]);
}
++outputColumn;
}
}
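
This kernel computes a block of four output columns per work item: get_global_id(0) selects the packed output channel, get_global_id(1) selects a block of four output columns (note the mul24 by 4), and get_global_id(2) selects the output row. Below is a hedged host-side sketch of how such a kernel might be enqueued (enqueue_conv and its parameters are assumptions, not thneed code); the tuned local work-group size would come from the optimizer.

#include <CL/cl.h>

// Enqueue the convolution with the work sizes the kernel expects; kernel
// arguments (images, weights, biases, strides, ...) are assumed to have been
// set already with clSetKernelArg.
void enqueue_conv(cl_command_queue q, cl_kernel k,
                  int packedOutputChannels, int numOutputColumns, int numOutputRows,
                  const size_t *tuned_local /* NULL lets the driver choose */) {
  size_t global[3] = {
    (size_t)packedOutputChannels,          // get_global_id(0): packed output channel
    (size_t)((numOutputColumns + 3) / 4),  // get_global_id(1): one work-item per 4 columns
    (size_t)numOutputRows,                 // get_global_id(2): output row
  };
  clEnqueueNDRangeKernel(q, k, 3, NULL, global, tuned_local, 0, NULL, NULL);
}
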
@@ -0,0 +1,140 @@
__kernel void convolution_horizontal_reduced_reads_1x1(
read_only image2d_t input,
short startPackedInputChannel,
short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
short packedOuputChannelOffset, short totalNumPackedOutputChannels,
read_only image2d_t weights, __constant float *biases,
short filterSizeX, short filterSizeY,
write_only image2d_t output,
short paddingX, short paddingY, short strideX, short strideY,
short neuron, float a, float b, float min_clamp, float max_clamp,
__constant float *parameters, __constant float *batchNormBiases,
short numOutputColumns,
short doAccumulate, read_only image2d_t accumulator) {

// init
const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
short packedOutputChannel = get_global_id(0);
short startOutputColumn = mul24((short)get_global_id(1), 4);
short outputRow = get_global_id(2);
short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
short startX = mad24(mad24(startOutputColumn, strideX, -paddingX),
totalNumPackedInputChannels, startPackedInputChannel);
short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

float4 outputValues[4];
for (short i = 0; i < 4; ++i) {
outputValues[i] = (float4)(0, 0, 0, 0);
}

int2 inputLocation;
inputLocation.y = mad24(outputRow, strideY, -paddingY);

int2 weightLocation;
weightLocation.x = 0;
weightLocation.y = packedOutputChannel;

// convolution
for (short packedInputChannel = startPackedInputChannel;
packedInputChannel < endPackedInputChannel; ++packedInputChannel) {

float4 weightValues[4];
for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
++weightLocation.x;
}

inputLocation.x = startX + packedInputChannel;
float4 inputValues[4];
for (short i = 0; i < 4; ++i) {
inputValues[i] = read_imagef(input, smp, inputLocation);
inputLocation.x += strideWithChannels;
}

for (short i = 0; i < 4; ++i) {
float4 curOutputValues = outputValues[i];
curOutputValues.x += inputValues[i].x * weightValues[0].x;
curOutputValues.x += inputValues[i].y * weightValues[0].y;
curOutputValues.x += inputValues[i].z * weightValues[0].z;
curOutputValues.x += inputValues[i].w * weightValues[0].w;
curOutputValues.y += inputValues[i].x * weightValues[1].x;
curOutputValues.y += inputValues[i].y * weightValues[1].y;
curOutputValues.y += inputValues[i].z * weightValues[1].z;
curOutputValues.y += inputValues[i].w * weightValues[1].w;
curOutputValues.z += inputValues[i].x * weightValues[2].x;
curOutputValues.z += inputValues[i].y * weightValues[2].y;
curOutputValues.z += inputValues[i].z * weightValues[2].z;
curOutputValues.z += inputValues[i].w * weightValues[2].w;
curOutputValues.w += inputValues[i].x * weightValues[3].x;
curOutputValues.w += inputValues[i].y * weightValues[3].y;
curOutputValues.w += inputValues[i].z * weightValues[3].z;
curOutputValues.w += inputValues[i].w * weightValues[3].w;
outputValues[i] = curOutputValues;
}
}

// bias
packedOutputChannel += packedOuputChannelOffset;
short outputChannel = mul24(packedOutputChannel, 4);
float4 biasValues = vload4(0, biases + outputChannel);
for (short i = 0; i < 4; ++i) {
outputValues[i] += biasValues;
}

// accumulate
if (doAccumulate) {
int2 outputLocation;
short outputColumn = startOutputColumn;
outputLocation.y = outputRow;
for (short i = 0; i < 4; ++i) {
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
if (outputColumn < numOutputColumns) {
outputValues[i] += read_imagef(accumulator, smp, outputLocation);
}
++outputColumn;
}
}

// activation
switch (neuron) {
case 1:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], 0.0f);
}
break;
case 2:
for (short i = 0; i < 4; ++i) {
outputValues[i] = a * tanh(b * outputValues[i]);
}
break;
case 3:
for (short i = 0; i < 4; ++i) {
outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
}
break;
case 4:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], min_clamp);
outputValues[i] = min(outputValues[i], max_clamp);
}
break;
case 5:
for (short i = 0; i < 4; ++i) {
outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
}
break;
}

// output
int2 outputLocation;
short outputColumn = startOutputColumn;
outputLocation.y = outputRow;
for (short i = 0; i < 4; ++i) {
outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
if (outputColumn < numOutputColumns) {
write_imagef(output, outputLocation, outputValues[i]);
}
++outputColumn;
}
}
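
The 1x1 variant above drops the filterSizeX/filterSizeY loops (the receptive field is a single location) and, when doAccumulate is set, adds a value read from the accumulator image before the activation, which is how partial sums from a previous pass can be folded in. Both kernels share the same neuron switch; restated in plain C++ for reference (apply_neuron is not part of thneed, just a per-component transcription of the switch):

#include <math.h>

float apply_neuron(short neuron, float x, float a, float b, float min_clamp, float max_clamp) {
  switch (neuron) {
    case 1: return fmaxf(x, 0.0f);                                      // ReLU
    case 2: return a * tanhf(b * x);                                    // scaled tanh
    case 3: return 1.0f / (1.0f + expf(-a * x + b));                    // sigmoid-style
    case 4: return fminf(fmaxf(x, min_clamp), max_clamp);               // clamp
    case 5: return fmaxf(x, 0.0f) + a * (expf(fminf(x, 0.0f)) - 1.0f);  // ELU-style
    default: return x;                                                  // no activation
  }
}
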
