From 2549499371bb72f27abedb747d6977ccb670e88d Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 2 Oct 2024 06:16:00 +0200
Subject: [PATCH 1/3] Implement dt_gaussian_fast_blur_cl_buffer() and
 dt_gaussian_fast_blur()

Although the standard gaussian blur code for cl buffers or CPU is pretty fast, we can do even better for small sigmas.

Using a simple NxN kernel is faster because we don't have to fiddle around with sorting data. Especially on cards
with lots of processing units and fast cl_mem, or on CPUs with cached memory, the algorithm is clearly faster due to cache locality.

Both (OpenCL and CPU) fast gaussian variants support 1, 2 or 4 channels and can run on a 5x5, 7x7 or a 9x9 coeff matrix.
The chosen matrix size depends on the sigma (9x9 for sigma > 1.15, 7x7 for sigma > 0.84, otherwise 5x5).

The measured performance gain depends on the chosen sigma. For both the OpenCL and the CPU code, the performance
relative to the "standard" gaussian is approximately:
 - 9x9 200%
 - 7x7 300%
 - 5x5 400%

The calculated coeffs and the kernels leave out the outermost positions as they contribute almost nothing to the correct result;
the error vs the standard gaussian is less than 0.2% for each kernel as long as the sigma is <= 1.5

Please note: both functions now also do a proper gaussian at the borders (did not before this pr!).
---
 data/kernels/gaussian.cl | 224 ++++++++++++++++++++
 src/common/gaussian.c    | 439 ++++++++++++++++++++++++++++++++++++++-
 src/common/gaussian.h    |   5 +-
 3 files changed, 659 insertions(+), 9 deletions(-)

diff --git a/data/kernels/gaussian.cl b/data/kernels/gaussian.cl
index 3d20a9bbd7db..a665e1216be8 100644
--- a/data/kernels/gaussian.cl
+++ b/data/kernels/gaussian.cl
@@ -282,6 +282,230 @@ gaussian_column_1c(global float *in, global float *out, unsigned int width, unsi
   }
 }
 
+/*
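+  kern is always an array of 25 floats holding one quadrant (5x5) of the symmetric kernel coeffs.
+  Access i = |y|*5 + |x| with |x| and |y| being the absolute column / row distance to the centre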
+*/
+__kernel void gaussian_kernel_9x9(global float *input,       // source pixels, ch floats per pixel
+                                  global float *output,      // destination pixels, same layout as input
+                                  const int w,               // image width in pixels
+                                  const int h,               // image height in pixels
+                                  const int ch,              // channels per pixel, only 1, 2 and 4 are handled
+                                  global const float *kern,  // coeffs, read as kern[5*|y| + |x|]
+                                  const float4 minval,       // per channel lower clamp of the result
+                                  const float4 maxval,       // per channel upper clamp of the result
+                                  const int dim)             // kernel radius; 4, 3 and 2 have an unrolled path
+{
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
+  if((col >= w) || (row >= h)) return;
+  // i is the index of the pixel handled here, w2..w4 are the index offsets to pixels 2..4 rows away
+  const int i = mad24(row, w, col);
+  const int w2 = 2 * w;
+  const int w3 = 3 * w;
+  const int w4 = 4 * w;
+  // unrolled paths add up all taps sharing one coeff first; anything else (e.g. near the border) uses the generic loop
+  if(ch == 1)
+  {
+    global float *in = input;
+    global float *out = output;
+    float val = 0.0f;
+    if(dim == 4 && col >= 4 && row >= 4 && col < w - 4 && row < h - 4) // 9x9 with all taps inside the image
+    {
+      val =   kern[10+4] * (in[i - w4 -2] + in[i - w4 +2] + in[i - w2 -4] + in[i - w2 +4] + in[i + w2 -4] + in[i + w2 +4] + in[i + w4 -2] + in[i + w4 +2])
+            + kern[5 +4] * (in[i - w4 -1] + in[i - w4 +1] + in[i -  w -4] + in[i -  w +4] + in[i +  w -4] + in[i +  w +4] + in[i + w4 -1] + in[i + w4 +1])
+            + kern[4]    * (in[i - w4]    + in[i - 4]     + in[i + 4]     + in[i + w4])
+            + kern[15+3] * (in[i - w3 -3] + in[i - w3 +3] + in[i + w3 -3] + in[i + w3 +3])
+            + kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+            + kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w  -3] + in[i -  w +3] + in[i +  w -3] + in[i +  w +3] + in[i + w3 -1] + in[i + w3 +1])
+            + kern[   3] * (in[i - w3]    + in[i - 3]     + in[i + 3]     + in[i + w3])
+            + kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+            + kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else if(dim == 3 && col >= 3 && row >= 3 && col < w - 3 && row < h - 3) // 7x7 with all taps inside the image
+    {
+      val =   kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+            + kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w  -3] + in[i -  w +3] + in[i +  w -3] + in[i +  w +3] + in[i + w3 -1] + in[i + w3 +1])
+            + kern[   3] * (in[i - w3]    + in[i - 3]     + in[i + 3]     + in[i + w3])
+            + kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+            + kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else if(dim == 2 && col >= 2 && row >= 2 && col < w - 2 && row < h - 2) // 5x5 with all taps inside the image
+    {
+      val =   kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else // generic loop: taps outside the image are skipped and the result is renormalized
+    {
+      float sum = 0.0f;
+      float div = 0.0f; // sum of the coeffs of all taps that were inside the image
+      for(int ir = -dim; ir <= dim; ir++)
+      {
+        const int irow = row+ir;
+        if(irow >= 0 && irow < h)
+        {
+          for(int ic = -dim; ic <= dim; ic++)
+          {
+            const int icol = col+ic;
+            if(icol >=0 && icol < w)
+            {
+              const float coeff = kern[5 * abs(ir) + abs(ic)];
+              div += coeff;
+              sum += coeff * in[mad24(irow, w, icol)];
+            }
+          }
+        }
+      }
+      val = (div != 0.0f) ? sum / div : 0.0f; // all used coeffs being zero results in a zero output
+    }
+    out[i] = clamp(val, minval.x, maxval.x);
+  }
+
+  else if(ch == 2) // same scheme as for one channel, working on float2 pixels
+  {
+    global float2 *in = (global float2 *)input;
+    global float2 *out = (global float2 *)output;
+    float2 val = 0.0f;
+    if(dim == 4 && col >= 4 && row >= 4 && col < w - 4 && row < h - 4) // 9x9 with all taps inside the image
+    {
+      val =   kern[10+4] * (in[i - w4 -2] + in[i - w4 +2] + in[i - w2 -4] + in[i - w2 +4] + in[i + w2 -4] + in[i + w2 +4] + in[i + w4 -2] + in[i + w4 +2])
+            + kern[5 +4] * (in[i - w4 -1] + in[i - w4 +1] + in[i -  w -4] + in[i -  w +4] + in[i +  w -4] + in[i +  w +4] + in[i + w4 -1] + in[i + w4 +1])
+            + kern[4]    * (in[i - w4]    + in[i - 4]     + in[i + 4]     + in[i + w4])
+            + kern[15+3] * (in[i - w3 -3] + in[i - w3 +3] + in[i + w3 -3] + in[i + w3 +3])
+            + kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+            + kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w  -3] + in[i -  w +3] + in[i +  w -3] + in[i +  w +3] + in[i + w3 -1] + in[i + w3 +1])
+            + kern[   3] * (in[i - w3]    + in[i - 3]     + in[i + 3]     + in[i + w3])
+            + kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+            + kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else if(dim == 3 && col >= 3 && row >= 3 && col < w - 3 && row < h - 3) // 7x7 with all taps inside the image
+    {
+      val =   kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+            + kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w  -3] + in[i -  w +3] + in[i +  w -3] + in[i +  w +3] + in[i + w3 -1] + in[i + w3 +1])
+            + kern[   3] * (in[i - w3]    + in[i - 3]     + in[i + 3]     + in[i + w3])
+            + kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+            + kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else if(dim == 2 && col >= 2 && row >= 2 && col < w - 2 && row < h - 2) // 5x5 with all taps inside the image
+    {
+      val =   kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else // generic loop: taps outside the image are skipped and the result is renormalized
+    {
+      float2 sum = 0.0f;
+      float div = 0.0f; // one scalar divisor shared by both channels
+      for(int ir = -dim; ir <= dim; ir++)
+      {
+        const int irow = row+ir;
+        if(irow >= 0 && irow < h)
+        {
+          for(int ic = -dim; ic <= dim; ic++)
+          {
+            const int icol = col+ic;
+            if(icol >=0 && icol < w)
+            {
+              const float coeff = kern[5 * abs(ir) + abs(ic)];
+              div += coeff;
+              sum += coeff * in[mad24(irow, w, icol)];
+            }
+          }
+        }
+      }
+      val = (div != 0.0f) ? sum / div : 0.0f;
+    }
+    out[i] = clamp(val, minval.xy, maxval.xy);
+  }
+
+  else if(ch == 4) // same scheme as for one channel, working on float4 pixels
+  {
+    global float4 *in = (global float4 *)input;
+    global float4 *out = (global float4 *)output;
+    float4 val = 0.0f;
+    if(dim == 4 && col >= 4 && row >= 4 && col < w - 4 && row < h - 4) // 9x9 with all taps inside the image
+    {
+      val =   kern[10+4] * (in[i - w4 -2] + in[i - w4 +2] + in[i - w2 -4] + in[i - w2 +4] + in[i + w2 -4] + in[i + w2 +4] + in[i + w4 -2] + in[i + w4 +2])
+            + kern[5 +4] * (in[i - w4 -1] + in[i - w4 +1] + in[i -  w -4] + in[i -  w +4] + in[i +  w -4] + in[i +  w +4] + in[i + w4 -1] + in[i + w4 +1])
+            + kern[4]    * (in[i - w4]    + in[i - 4]     + in[i + 4]     + in[i + w4])
+            + kern[15+3] * (in[i - w3 -3] + in[i - w3 +3] + in[i + w3 -3] + in[i + w3 +3])
+            + kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+            + kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w  -3] + in[i -  w +3] + in[i +  w -3] + in[i +  w +3] + in[i + w3 -1] + in[i + w3 +1])
+            + kern[   3] * (in[i - w3]    + in[i - 3]     + in[i + 3]     + in[i + w3])
+            + kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+            + kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else if(dim == 3 && col >= 3 && row >= 3 && col < w - 3 && row < h - 3) // 7x7 with all taps inside the image
+    {
+      val =   kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+            + kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w  -3] + in[i -  w +3] + in[i +  w -3] + in[i +  w +3] + in[i + w3 -1] + in[i + w3 +1])
+            + kern[   3] * (in[i - w3]    + in[i - 3]     + in[i + 3]     + in[i + w3])
+            + kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+            + kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else if(dim == 2 && col >= 2 && row >= 2 && col < w - 2 && row < h - 2) // 5x5 with all taps inside the image
+    {
+      val =   kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i -  w -2] + in[i -  w +2] + in[i +  w -2] + in[i +  w +2] + in[i + w2 -1] + in[i + w2 +1])
+            + kern[   2] * (in[i - w2]    + in[i - 2]     + in[i + 2]     + in[i + w2])
+            + kern[ 5+1] * (in[i -  w -1] + in[i -  w +1] + in[i +  w -1] + in[i +  w +1])
+            + kern[   1] * (in[i -  w]    + in[i - 1]     + in[i + 1]     + in[i +  w])
+            + kern[   0] * in[i];
+    }
+    else // generic loop: taps outside the image are skipped and the result is renormalized
+    {
+      float4 sum = 0.0f;
+      float div = 0.0f; // one scalar divisor shared by all four channels
+      for(int ir = -dim; ir <= dim; ir++)
+      {
+        const int irow = row+ir;
+        if(irow >= 0 && irow < h)
+        {
+          for(int ic = -dim; ic <= dim; ic++)
+          {
+            const int icol = col+ic;
+            if(icol >=0 && icol < w)
+            {
+              const float coeff = kern[5 * abs(ir) + abs(ic)];
+              div += coeff;
+              sum += coeff * in[mad24(irow, w, icol)];
+            }
+          }
+        }
+      }
+      val = (div != 0.0f) ? sum / div : 0.0f;
+    }
+    out[i] = clamp(val, minval, maxval);
+  }
+}
 
 
 float
diff --git a/src/common/gaussian.c b/src/common/gaussian.c
index ef0bbac63e51..e4a57e00cfaf 100644
--- a/src/common/gaussian.c
+++ b/src/common/gaussian.c
@@ -22,11 +22,20 @@
 #include "common/gaussian.h"
 #include "common/math.h"
 #include "common/opencl.h"
+#include "common/imagebuf.h"
 
 #define BLOCKSIZE (1 << 6)
 
-static void compute_gauss_params(const float sigma, dt_gaussian_order_t order, float *a0, float *a1,
-                                 float *a2, float *a3, float *b1, float *b2, float *coefp, float *coefn)
+static void _compute_gauss_params(const float sigma,
+                                  dt_gaussian_order_t order,
+                                  float *a0,
+                                  float *a1,
+                                  float *a2,
+                                  float *a3,
+                                  float *b1,
+                                  float *b2,
+                                  float *coefp,
+                                  float *coefn)
 {
   const float alpha = 1.695f / sigma;
   const float ema = expf(-alpha);
@@ -78,6 +87,39 @@ static void compute_gauss_params(const float sigma, dt_gaussian_order_t order, f
   *coefn = (*a2 + *a3) / (1.0f + *b1 + *b2);
 }
 
+static int _calc_9x9_gauss_coeffs(float *coeffs, const float sigma)
+{
+  /* Fills coeffs[25] with one quadrant (access via 5*|y| + |x|) of a normalized gaussian and
+     returns the kernel radius: 4 (9x9), 3 (7x7) or 2 (5x5) depending on sigma.
+     As we want to leave out the outermost locations for each NxN kernel we choose
+     kernel size depending on sigma and make sure the normalizing is done depending on this.
+  */
+  const float r[6] = { 0.0f, 0.6f, 0.84f, 1.15f, 1.5f, 2.0f };
+  const int dim = sigma > r[3] ? 4 : (sigma > r[2] ? 3 : 2);
+  const float range = sqrf(3.0f * r[dim]);
+  // a zero sigma would give 0/0 = NaN for the centre tap, the tiny floor yields the identity kernel instead
+  const float temp = -2.0f * fmaxf(sigma * sigma, 1e-12f);
+  float kernel[9][9];
+  float sum = 0.0f;
+  for(int k = -4; k < 5; k++)
+  {
+    for(int j = -4; j < 5; j++)
+    {
+      const float rad = k*k + j*j;
+      // taps with a squared distance above range are zeroed so they don't take part in normalizing
+      const float val = (rad <= range) ? expf(rad / temp) : 0.0f;
+      kernel[k + 4][j + 4] = val;
+      sum += val;
+    }
+  }
+
+  // the kernel is symmetric in both directions so only the quadrant with k, j >= 0 is exported
+  for(int k = 0; k < 5; k++)
+    for(int j = 0; j < 5; j++)
+      coeffs[5*k+j] = kernel[k+4][j+4] / sum;
+  return dim;
+}
+
 size_t dt_gaussian_memory_use(const int width,    // width of input image
                               const int height,   // height of input image
                               const int channels) // channels per pixel
@@ -159,7 +201,7 @@ void dt_gaussian_blur(dt_gaussian_t *g, const float *const in, float *const out)
 
   float a0, a1, a2, a3, b1, b2, coefp, coefn;
 
-  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);
+  _compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);
 
   float *temp = g->buf;
 
@@ -312,7 +354,7 @@ void dt_gaussian_blur_4c(dt_gaussian_t *g, const float *const in, float *const o
 
   float a0, a1, a2, a3, b1, b2, coefp, coefn;
 
-  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);
+  _compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);
 
   float *const temp = g->buf;
 
@@ -464,6 +506,319 @@ void dt_gaussian_free(dt_gaussian_t *g)
   free(g);
 }
 
+DT_OMP_DECLARE_SIMD(aligned(input, output:64)) // NOTE(review): presumably asserts 64 byte aligned buffers - confirm for all callers
+static void _fast_9x9_kernel_1(float *input,       // single channel source, neighbours are read from here
+                               float *output,      // single channel destination, one float written per pixel
+                               const int width,    // width in pixels
+                               const int height,   // height in pixels
+                               const float sigma,  // selects kernel radius and coeffs
+                               const float min,    // lower clamp of the result
+                               const float max)    // upper clamp of the result
+{
+  float kern[25];
+  const int dim = _calc_9x9_gauss_coeffs(kern, sigma);
+  const int w1 = width;
+  const int w2 = 2 * width;
+  const int w3 = 3 * width;
+  const int w4 = 4 * width;
+  // w1..w4 are the float offsets to the same column 1..4 rows away
+  DT_OMP_FOR(collapse(2))
+  for(int row = 0; row < height; row++)
+  {
+    for(int col = 0; col < width; col++)
+    {
+      const size_t i = (size_t)row * width + col;
+      float *in = &input[i];   // all taps are addressed relative to the centre pixel
+      float *out= &output[i];
+
+      float val = 0.0f;
+      if(dim == 4 && col >= 4 && row >= 4 && col < width - 4 && row < height - 4) // 9x9 with all taps inside the image
+      {
+        val = kern[10+4] * (in[-w4 -2] + in[-w4 +2] + in[-w2 -4] + in[-w2 +4] + in[+w2 -4] + in[+w2 +4] + in[+w4 -2] + in[+w4 +2])
+            + kern[5 +4] * (in[-w4 -1] + in[-w4 +1] + in[-w1 -4] + in[-w1 +4] + in[+w1 -4] + in[+w1 +4] + in[+w4 -1] + in[+w4 +1])
+            + kern[4]    * (in[-w4]    + in[-4]     + in[+4]     + in[+w4])
+            + kern[15+3] * (in[-w3 -3] + in[-w3 +3] + in[+w3 -3] + in[+w3 +3])
+            + kern[10+3] * (in[-w3 -2] + in[-w3 +2] + in[-w2 -3] + in[-w2 +3] + in[+w2 -3] + in[+w2 +3] + in[+w3 -2] + in[+w3 +2])
+            + kern[ 5+3] * (in[-w3 -1] + in[-w3 +1] + in[-w1 -3] + in[-w1 +3] + in[+w1 -3] + in[+w1 +3] + in[+w3 -1] + in[+w3 +1])
+            + kern[   3] * (in[-w3]    + in[-3]     + in[+3]     + in[+w3])
+            + kern[10+2] * (in[-w2 -2] + in[-w2 +2] + in[+w2 -2] + in[+w2 +2])
+            + kern[ 5+2] * (in[-w2 -1] + in[-w2 +1] + in[-w1 -2] + in[-w1 +2] + in[+w1 -2] + in[+w1 +2] + in[+w2 -1] + in[+w2 +1])
+            + kern[   2] * (in[-w2]    + in[-2]     + in[+2]     + in[+w2])
+            + kern[ 5+1] * (in[-w1 -1] + in[-w1 +1] + in[+w1 -1] + in[+w1 +1])
+            + kern[   1] * (in[-w1]    + in[-1]     + in[+1]     + in[+w1])
+            + kern[   0] * in[0];
+      }
+      else if(dim == 3 && col >= 3 && row >= 3 && col < width - 3 && row < height - 3) // 7x7 with all taps inside the image
+      {
+        val = kern[10+3] * (in[-w3 -2] + in[-w3 +2] + in[-w2 -3] + in[-w2 +3] + in[+w2 -3] + in[+w2 +3] + in[+w3 -2] + in[+w3 +2])
+            + kern[ 5+3] * (in[-w3 -1] + in[-w3 +1] + in[-w1 -3] + in[-w1 +3] + in[+w1 -3] + in[+w1 +3] + in[+w3 -1] + in[+w3 +1])
+            + kern[   3] * (in[-w3]    + in[-3]     + in[+3]     + in[+w3])
+            + kern[10+2] * (in[-w2 -2] + in[-w2 +2] + in[+w2 -2] + in[+w2 +2])
+            + kern[ 5+2] * (in[-w2 -1] + in[-w2 +1] + in[-w1 -2] + in[-w1 +2] + in[+w1 -2] + in[+w1 +2] + in[+w2 -1] + in[+w2 +1])
+            + kern[   2] * (in[-w2]    + in[-2]     + in[+2]     + in[+w2])
+            + kern[ 5+1] * (in[-w1 -1] + in[-w1 +1] + in[+w1 -1] + in[+w1 +1])
+            + kern[   1] * (in[-w1]    + in[-1]     + in[+1]     + in[+w1])
+            + kern[   0] * in[0];
+      }
+      else if(dim == 2 && col >= 2 && row >= 2 && col < width - 2 && row < height - 2) // 5x5 with all taps inside the image
+      {
+        val =
+              kern[ 5+2] * (in[-w2 -1] + in[-w2 +1] + in[-w1 -2] + in[-w1 +2] + in[+w1 -2] + in[+w1 +2] + in[+w2 -1] + in[+w2 +1])
+            + kern[   2] * (in[-w2]    + in[-2]     + in[+2]     + in[+w2])
+            + kern[ 5+1] * (in[-w1 -1] + in[-w1 +1] + in[+w1 -1] + in[+w1 +1])
+            + kern[   1] * (in[-w1]    + in[-1]     + in[+1]     + in[+w1])
+            + kern[   0] * in[0];
+      }
+      else // generic loop: taps outside the image are skipped and the result is renormalized
+      {
+        float div = 0.0f; // sum of the coeffs of all taps that were inside the image
+        for(int ir = -dim; ir <= dim; ir++)
+        {
+          const int irow = row+ir;
+          if(irow >= 0 && irow < height)
+          {
+            for(int ic = -dim; ic <= dim; ic++)
+            {
+              const int icol = col+ic;
+              if(icol >=0 && icol < width)
+              {
+                const float coeff = kern[5 * ABS(ir) + ABS(ic)];
+                div += coeff;
+                val += coeff * in[ir * w1 + ic];
+              }
+            }
+          }
+        }
+        val = (div != 0.0f) ? val / div : 0.0f; // all used coeffs being zero results in a zero output
+      }
+      out[0] = CLAMPF(val, min, max);
+    }
+  }
+}
+
+
+DT_OMP_DECLARE_SIMD(aligned(input, output:64)) // NOTE(review): presumably asserts 64 byte aligned buffers - confirm for all callers
+static void _fast_9x9_kernel_2(float *input,       // source with 2 interleaved floats per pixel
+                               float *output,      // destination with 2 interleaved floats per pixel
+                               const int width,    // width in pixels
+                               const int height,   // height in pixels
+                               const float sigma,  // selects kernel radius and coeffs
+                               const float min,    // lower clamp of the result
+                               const float max)    // upper clamp of the result
+{
+  float kern[25];
+  const int dim = _calc_9x9_gauss_coeffs(kern, sigma);
+  // w1..w4 are the float offsets to the same column 1..4 rows away, a column step is 2 floats
+  const int w1 = 2 * width;
+  const int w2 = 4 * width;
+  const int w3 = 6 * width;
+  const int w4 = 8 * width;
+
+  DT_OMP_FOR(collapse(2))
+  for(int row = 0; row < height; row++)
+  {
+    for(int col = 0; col < width; col++)
+    {
+      // widen before multiplying so the pixel index can't overflow int, same as in _fast_9x9_kernel_1
+      const size_t i = (size_t)2 * ((size_t)row * width + col);
+      float *in = &input[i];   // all taps are addressed relative to the centre pixel
+      float *out= &output[i];
+      dt_aligned_pixel_t val = { 0.0f, 0.0f, 0.0f, 0.0f }; // only the first two entries are used
+      if(dim == 4 && col >= 4 && row >= 4 && col < width - 4 && row < height - 4) // 9x9 with all taps inside the image
+      {
+        for(int c = 0; c < 2; c++)
+          val[c] =
+            kern[10+4] * (in[-w4 -4+c] + in[-w4 +4+c] + in[-w2 -8+c] + in[-w2 +8+c] + in[+w2 -8+c] + in[+w2 +8+c] + in[+w4 -4+c] + in[+w4 +4+c])
+          + kern[5 +4] * (in[-w4 -2+c] + in[-w4 +2+c] + in[-w1 -8+c] + in[-w1 +8+c] + in[+w1 -8+c] + in[+w1 +8+c] + in[+w4 -2+c] + in[+w4 +2+c])
+          + kern[4]    * (in[-w4+c]    + in[-8+c]     + in[+8+c]     + in[+w4+c])
+          + kern[15+3] * (in[-w3 -6+c] + in[-w3 +6+c] + in[+w3 -6+c] + in[+w3 +6+c])
+          + kern[10+3] * (in[-w3 -4+c] + in[-w3 +4+c] + in[-w2 -6+c] + in[-w2 +6+c] + in[+w2 -6+c] + in[+w2 +6+c] + in[+w3 -4+c] + in[+w3 +4+c])
+          + kern[ 5+3] * (in[-w3 -2+c] + in[-w3 +2+c] + in[-w1 -6+c] + in[-w1 +6+c] + in[+w1 -6+c] + in[+w1 +6+c] + in[+w3 -2+c] + in[+w3 +2+c])
+          + kern[   3] * (in[-w3+c]    + in[-6+c]     + in[+6+c]     + in[+w3+c])
+          + kern[10+2] * (in[-w2 -4+c] + in[-w2 +4+c] + in[+w2 -4+c] + in[+w2 +4+c])
+          + kern[ 5+2] * (in[-w2 -2+c] + in[-w2 +2+c] + in[-w1 -4+c] + in[-w1 +4+c] + in[+w1 -4+c] + in[+w1 +4+c] + in[+w2 -2+c] + in[+w2 +2+c])
+          + kern[   2] * (in[-w2+c]    + in[-4+c]     + in[+4+c]     + in[+w2+c])
+          + kern[ 5+1] * (in[-w1 -2+c] + in[-w1 +2+c] + in[+w1 -2+c] + in[+w1 +2+c])
+          + kern[   1] * (in[-w1+c]    + in[-2+c]     + in[+2+c]     + in[+w1+c])
+          + kern[   0] * in[c];
+      }
+      else if(dim == 3 && col >= 3 && row >= 3 && col < width - 3 && row < height - 3) // 7x7 with all taps inside the image
+      {
+        for(int c = 0; c < 2; c++)
+          val[c] =
+            kern[10+3] * (in[-w3 -4+c] + in[-w3 +4+c] + in[-w2 -6+c] + in[-w2 +6+c] + in[+w2 -6+c] + in[+w2 +6+c] + in[+w3 -4+c] + in[+w3 +4+c])
+          + kern[ 5+3] * (in[-w3 -2+c] + in[-w3 +2+c] + in[-w1 -6+c] + in[-w1 +6+c] + in[+w1 -6+c] + in[+w1 +6+c] + in[+w3 -2+c] + in[+w3 +2+c])
+          + kern[   3] * (in[-w3+c]    + in[-6+c]     + in[+6+c]     + in[+w3+c])
+          + kern[10+2] * (in[-w2 -4+c] + in[-w2 +4+c] + in[+w2 -4+c] + in[+w2 +4+c])
+          + kern[ 5+2] * (in[-w2 -2+c] + in[-w2 +2+c] + in[-w1 -4+c] + in[-w1 +4+c] + in[+w1 -4+c] + in[+w1 +4+c] + in[+w2 -2+c] + in[+w2 +2+c])
+          + kern[   2] * (in[-w2+c]    + in[-4+c]     + in[+4+c]     + in[+w2+c])
+          + kern[ 5+1] * (in[-w1 -2+c] + in[-w1 +2+c] + in[+w1 -2+c] + in[+w1 +2+c])
+          + kern[   1] * (in[-w1+c]    + in[-2+c]     + in[+2+c]     + in[+w1+c])
+          + kern[   0] * in[c];
+      }
+      else if(dim == 2 && col >= 2 && row >= 2 && col < width - 2 && row < height - 2) // 5x5 with all taps inside the image
+      {
+        for(int c = 0; c < 2; c++)
+          val[c] =
+            kern[ 5+2] * (in[-w2 -2+c] + in[-w2 +2+c] + in[-w1 -4+c] + in[-w1 +4+c] + in[+w1 -4+c] + in[+w1 +4+c] + in[+w2 -2+c] + in[+w2 +2+c])
+          + kern[   2] * (in[-w2+c]    + in[-4+c]     + in[+4+c]     + in[+w2+c])
+          + kern[ 5+1] * (in[-w1 -2+c] + in[-w1 +2+c] + in[+w1 -2+c] + in[+w1 +2+c])
+          + kern[   1] * (in[-w1+c]    + in[-2+c]     + in[+2+c]     + in[+w1+c])
+          + kern[   0] * in[c];
+      }
+      else // generic loop: taps outside the image are skipped and the result is renormalized
+      {
+        float div = 0.0f; // sum of the coeffs of all taps that were inside the image
+        for(int ir = -dim; ir <= dim; ir++)
+        {
+          const int irow = row+ir;
+          if(irow >= 0 && irow < height)
+          {
+            for(int ic = -dim; ic <= dim; ic++)
+            {
+              const int icol = col+ic;
+              if(icol >=0 && icol < width)
+              {
+                const float coeff = kern[5 * ABS(ir) + ABS(ic)];
+                div += coeff;
+                for(int c = 0; c < 2; c++)
+                  val[c] += coeff * in[2*(ir * width + ic)+c];
+              }
+            }
+          }
+        }
+        for(int c = 0; c < 2; c++)
+          val[c] = (div != 0.0f) ? val[c] / div : 0.0f; // all used coeffs being zero results in a zero output
+      }
+      for(int c = 0; c < 2; c++)
+        out[c] = CLAMPF(val[c], min, max);
+    }
+  }
+}
+
+DT_OMP_DECLARE_SIMD(aligned(input, output:64)) // NOTE(review): presumably asserts 64 byte aligned buffers - confirm for all callers
+static void _fast_9x9_kernel_4(float *input,       // source with 4 interleaved floats per pixel
+                               float *output,      // destination with 4 interleaved floats per pixel
+                               const int width,    // width in pixels
+                               const int height,   // height in pixels
+                               const float sigma,  // selects kernel radius and coeffs
+                               const float min,    // lower clamp of the result
+                               const float max)    // upper clamp of the result
+{
+  float kern[25];
+  const int dim = _calc_9x9_gauss_coeffs(kern, sigma);
+  // w1..w4 are the float offsets to the same column 1..4 rows away, a column step is 4 floats
+  const int w1 = 4 * width;
+  const int w2 = 8 * width;
+  const int w3 = 12 * width;
+  const int w4 = 16 * width;
+
+  DT_OMP_FOR(collapse(2))
+  for(int row = 0; row < height; row++)
+  {
+    for(int col = 0; col < width; col++)
+    {
+      // widen before multiplying so the pixel index can't overflow int, same as in _fast_9x9_kernel_1
+      const size_t i = (size_t)4 * ((size_t)row * width + col);
+      float *in = &input[i];   // all taps are addressed relative to the centre pixel
+      float *out= &output[i];
+      dt_aligned_pixel_t val = { 0.0f, 0.0f, 0.0f, 0.0f };
+      if(dim == 4 && col >= 4 && row >= 4 && col < width - 4 && row < height - 4) // 9x9 with all taps inside the image
+      {
+        for(int c = 0; c < 4; c++)
+          val[c] =
+            kern[10+4] * (in[-w4 -8+c]  + in[-w4 +8+c]  + in[-w2 -16+c] + in[-w2 +16+c] + in[+w2 -16+c] + in[+w2 +16+c] + in[+w4 -8+c] + in[+w4 +8+c])
+          + kern[5 +4] * (in[-w4 -4+c]  + in[-w4 +4+c]  + in[-w1 -16+c] + in[-w1 +16+c] + in[+w1 -16+c] + in[+w1 +16+c] + in[+w4 -4+c] + in[+w4 +4+c])
+          + kern[4]    * (in[-w4+c]     + in[-16+c]     + in[+16+c]     + in[+w4+c])
+          + kern[15+3] * (in[-w3 -12+c] + in[-w3 +12+c] + in[+w3 -12+c] + in[+w3 +12+c])
+          + kern[10+3] * (in[-w3 -8+c]  + in[-w3 +8+c]  + in[-w2 -12+c] + in[-w2 +12+c] + in[+w2 -12+c] + in[+w2 +12+c] + in[+w3 -8+c] + in[+w3 +8+c])
+          + kern[ 5+3] * (in[-w3 -4+c]  + in[-w3 +4+c]  + in[-w1 -12+c] + in[-w1 +12+c] + in[+w1 -12+c] + in[+w1 +12+c] + in[+w3 -4+c] + in[+w3 +4+c])
+          + kern[   3] * (in[-w3+c]     + in[-12+c]     + in[+12+c]     + in[+w3+c])
+          + kern[10+2] * (in[-w2 -8+c]  + in[-w2 +8+c]  + in[+w2 -8+c]  + in[+w2 +8+c])
+          + kern[ 5+2] * (in[-w2 -4+c]  + in[-w2 +4+c]  + in[-w1 -8+c]  + in[-w1 +8+c]  + in[+w1 -8+c]  + in[+w1 +8+c]  + in[+w2 -4+c] + in[+w2 +4+c])
+          + kern[   2] * (in[-w2+c]     + in[-8+c]      + in[+8+c]      + in[+w2+c])
+          + kern[ 5+1] * (in[-w1 -4+c]  + in[-w1 +4+c]  + in[+w1 -4+c]  + in[+w1 +4+c])
+          + kern[   1] * (in[-w1+c]     + in[-4+c]      + in[+4+c]      + in[+w1+c])
+          + kern[   0] * in[c];
+      }
+      else if(dim == 3 && col >= 3 && row >= 3 && col < width - 3 && row < height - 3) // 7x7 with all taps inside the image
+      {
+        for(int c = 0; c < 4; c++)
+          val[c] =
+            kern[10+3] * (in[-w3 -8+c] + in[-w3 +8+c] + in[-w2 -12+c] + in[-w2 +12+c] + in[+w2 -12+c] + in[+w2 +12+c] + in[+w3 -8+c] + in[+w3 +8+c])
+          + kern[ 5+3] * (in[-w3 -4+c] + in[-w3 +4+c] + in[-w1 -12+c] + in[-w1 +12+c] + in[+w1 -12+c] + in[+w1 +12+c] + in[+w3 -4+c] + in[+w3 +4+c])
+          + kern[   3] * (in[-w3+c]    + in[-12+c]    + in[+12+c]     + in[+w3+c])
+          + kern[10+2] * (in[-w2 -8+c] + in[-w2 +8+c] + in[+w2 -8+c]  + in[+w2 +8+c])
+          + kern[ 5+2] * (in[-w2 -4+c] + in[-w2 +4+c] + in[-w1 -8+c]  + in[-w1 +8+c]  + in[+w1 -8+c]  + in[+w1 +8+c]  + in[+w2 -4+c] + in[+w2 +4+c])
+          + kern[   2] * (in[-w2+c]    + in[-8+c]     + in[+8+c]      + in[+w2+c])
+          + kern[ 5+1] * (in[-w1 -4+c] + in[-w1 +4+c] + in[+w1 -4+c]  + in[+w1 +4+c])
+          + kern[   1] * (in[-w1+c]    + in[-4+c]     + in[+4+c]      + in[+w1+c])
+          + kern[   0] * in[c];
+      }
+      else if(dim == 2 && col >= 2 && row >= 2 && col < width - 2 && row < height - 2) // 5x5 with all taps inside the image
+      {
+        for(int c = 0; c < 4; c++)
+          val[c] =
+            kern[ 5+2] * (in[-w2 -4+c] + in[-w2 +4+c] + in[-w1 -8+c] + in[-w1 +8+c] + in[+w1 -8+c] + in[+w1 +8+c] + in[+w2 -4+c] + in[+w2 +4+c])
+          + kern[   2] * (in[-w2+c]    + in[-8+c]     + in[+8+c]     + in[+w2+c])
+          + kern[ 5+1] * (in[-w1 -4+c] + in[-w1 +4+c] + in[+w1 -4+c] + in[+w1 +4+c])
+          + kern[   1] * (in[-w1+c]    + in[-4+c]     + in[+4+c]     + in[+w1+c])
+          + kern[   0] * in[c];
+      }
+      else // generic loop: taps outside the image are skipped and the result is renormalized
+      {
+        float div = 0.0f; // sum of the coeffs of all taps that were inside the image
+        for(int ir = -dim; ir <= dim; ir++)
+        {
+          const int irow = row+ir;
+          if(irow >= 0 && irow < height)
+          {
+            for(int ic = -dim; ic <= dim; ic++)
+            {
+              const int icol = col+ic;
+              if(icol >=0 && icol < width)
+              {
+                const float coeff = kern[5 * ABS(ir) + ABS(ic)];
+                div += coeff;
+                for(int c = 0; c < 4; c++)
+                  val[c] += coeff * in[4*(ir * width + ic)+c];
+              }
+            }
+          }
+        }
+        for(int c = 0; c < 4; c++)
+          val[c] = (div != 0.0f) ? val[c] / div : 0.0f; // all used coeffs being zero results in a zero output
+      }
+      for(int c = 0; c < 4; c++)
+        out[c] = CLAMPF(val[c], min, max);
+    }
+  }
+}
+
+void dt_gaussian_fast_blur(float *in,          // source buffer, ch floats per pixel
+                           float *out,         // destination buffer, may be the same as in
+                           const int width,    // width in pixels
+                           const int height,   // height in pixels
+                           const float sigma,  // selects kernel radius and coeffs
+                           const float min,    // lower clamp of the result
+                           const float max,    // upper clamp of the result
+                           const int ch)       // channels per pixel, must be 1, 2 or 4
+{
+  // no kernel exists for other channel counts; leave out untouched instead of copying garbage over it
+  if(ch != 1 && ch != 2 && ch != 4) return;
+  const gboolean inplace = (in == out);
+  const size_t bsize = (size_t)ch * width * height;
+  // the kernels read neighbouring pixels so blurring in place requires a scratch buffer
+  float *tmpout = inplace ? dt_alloc_align_float(bsize) : out;
+  if(!tmpout) return; // allocation failed (or no output given), keep the data unchanged
+
+  if(ch == 1)       _fast_9x9_kernel_1(in, tmpout, width, height, sigma, min, max);
+  else if(ch == 2)  _fast_9x9_kernel_2(in, tmpout, width, height, sigma, min, max);
+  else              _fast_9x9_kernel_4(in, tmpout, width, height, sigma, min, max);
+
+  if(inplace) dt_iop_image_copy(out, tmpout, bsize);
+  if(inplace) dt_free_align(tmpout);
+}
 
 #ifdef HAVE_OPENCL
 dt_gaussian_cl_global_t *dt_gaussian_init_cl_global()
@@ -477,6 +832,7 @@ dt_gaussian_cl_global_t *dt_gaussian_init_cl_global()
   g->kernel_gaussian_transpose_2c = dt_opencl_create_kernel(program, "gaussian_transpose_2c");
   g->kernel_gaussian_column_4c = dt_opencl_create_kernel(program, "gaussian_column_4c");
   g->kernel_gaussian_transpose_4c = dt_opencl_create_kernel(program, "gaussian_transpose_4c");
+  g->kernel_gaussian_9x9 = dt_opencl_create_kernel(program, "gaussian_kernel_9x9");
   return g;
 }
 
@@ -625,7 +981,7 @@ cl_int dt_gaussian_blur_cl(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev_out)
 
   // compute gaussian parameters
   float a0, a1, a2, a3, b1, b2, coefp, coefn;
-  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);
+  _compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);
 
   // copy dev_in to intermediate buffer dev_temp1
   err = dt_opencl_enqueue_copy_image_to_buffer(devid, dev_in, dev_temp1, origin, region, 0);
@@ -680,7 +1036,7 @@ cl_int dt_gaussian_blur_cl(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev_out)
   return dt_opencl_enqueue_copy_buffer_to_image(devid, dev_temp1, dev_out, 0, origin, region);
 }
 
-cl_int dt_gaussian_blur_cl_buffer(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev_out)
+static cl_int _gaussian_blur_cl_buffer(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev_out)
 {
   cl_int err = DT_OPENCL_DEFAULT_ERROR;
   const int devid = g->devid;
@@ -731,7 +1087,7 @@ cl_int dt_gaussian_blur_cl_buffer(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev
 
   // compute gaussian parameters
   float a0, a1, a2, a3, b1, b2, coefp, coefn;
-  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);
+  _compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);
 
   // first blur step: column by column with dev_in -> dev_temp2
   sizes[0] = ROUNDUPDWD(width, devid);
@@ -778,6 +1134,74 @@ cl_int dt_gaussian_blur_cl_buffer(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev
   return dt_opencl_enqueue_kernel_2d_with_local(devid, kernel_gaussian_transpose, sizes, local);
 }
 
+/* Blurs dev_in into dev_out with the 9x9 kernel (dev_in == dev_out is allowed); falls back to the standard gaussian if sigma is too large (> 1.5) for small errors with a 9x9 kernel
+*/
+cl_int dt_gaussian_fast_blur_cl_buffer(const int devid,
+                                       cl_mem dev_in,
+                                       cl_mem dev_out,
+                                       const int width,
+                                       const int height,
+                                       const float sigma,
+                                       const int ch,
+                                       const float *min,
+                                       const float *max)
+{
+  cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE; // returned unchanged if a device allocation below fails
+  dt_gaussian_cl_global_t *global = darktable.opencl->gaussian;
+
+  const gboolean inplace = (dev_in == dev_out);
+  const size_t bsize = (size_t)ch * width * height * sizeof(float);
+  cl_mem tmp_out = dev_out; // replaced by a scratch buffer in the in-place case
+  cl_mem kern_cl = NULL;
+
+  dt_aligned_pixel_t Labmax = { 0.0f, 0.0f, 0.0f, 0.0f }; // entries beyond ch stay 0
+  dt_aligned_pixel_t Labmin = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+  for(int k = 0; k < MIN(ch, 4); k++) // take over the caller's limits for the channels in use
+  {
+    Labmax[k] = max[k];
+    Labmin[k] = min[k];
+  }
+
+  if(sigma > 1.5f) // fallback path: standard gaussian on the same buffers
+  {
+    dt_gaussian_cl_t *g = dt_gaussian_init_cl(devid, width, height, ch, Labmax, Labmin, sigma, DT_IOP_GAUSSIAN_ZERO);
+    if(!g) return DT_OPENCL_PROCESS_CL;
+
+    err = _gaussian_blur_cl_buffer(g, dev_in, dev_out);
+    dt_gaussian_free_cl(g);
+    return err;
+  }
+
+  if(inplace) // give the kernel an output buffer distinct from its input
+  {
+    tmp_out = dt_opencl_alloc_device_buffer(devid, bsize);
+    if(tmp_out == NULL) goto error;
+  }
+
+  float kern[25]; // coefficient table filled by _calc_9x9_gauss_coeffs()
+  const int dim = _calc_9x9_gauss_coeffs(kern, sigma); // return value is passed on to the kernel as dim
+  kern_cl = dt_opencl_copy_host_to_device_constant(devid, 25 * sizeof(float), kern);
+  if(kern_cl == NULL) goto error;
+
+  err = dt_opencl_enqueue_kernel_2d_args(devid, global->kernel_gaussian_9x9, width, height,
+    CLARG(dev_in), CLARG(tmp_out), CLARG(width), CLARG(height), CLARG(ch), CLARG(kern_cl),
+    CLFLARRAY(4, Labmin), CLFLARRAY(4, Labmax), CLARG(dim));
+  if(err != CL_SUCCESS) goto error;
+
+  if(inplace) // copy the blurred scratch buffer back into the caller's buffer
+  {
+    err = dt_opencl_enqueue_copy_buffer_to_buffer(devid, tmp_out, dev_out, 0, 0, bsize);
+    if(err != CL_SUCCESS) goto error;
+  }
+
+error: // also reached on success; err then holds the result of the last enqueue
+  dt_opencl_release_mem_object(kern_cl);
+  if(inplace) dt_opencl_release_mem_object(tmp_out); // tmp_out is only a local allocation when in-place
+
+  return err;
+}
+
 void dt_gaussian_free_cl_global(dt_gaussian_cl_global_t *g)
 {
   if(!g) return;
@@ -788,6 +1212,7 @@ void dt_gaussian_free_cl_global(dt_gaussian_cl_global_t *g)
   dt_opencl_free_kernel(g->kernel_gaussian_transpose_2c);
   dt_opencl_free_kernel(g->kernel_gaussian_column_4c);
   dt_opencl_free_kernel(g->kernel_gaussian_transpose_4c);
+  dt_opencl_free_kernel(g->kernel_gaussian_9x9);
   free(g);
 }
 
diff --git a/src/common/gaussian.h b/src/common/gaussian.h
index 788c49077f0e..e3910c6ecf70 100644
--- a/src/common/gaussian.h
+++ b/src/common/gaussian.h
@@ -55,7 +55,7 @@ void dt_gaussian_blur(dt_gaussian_t *g, const float *const in, float *const out)
 void dt_gaussian_blur_4c(dt_gaussian_t *g, const float *const in, float *const out);
 
 void dt_gaussian_free(dt_gaussian_t *g);
-
+void dt_gaussian_fast_blur(float *in, float *out, const int width, const int height, const float sigma, const float min, const float max, const int channels);
 
 #ifdef HAVE_OPENCL
 typedef struct dt_gaussian_cl_global_t
@@ -63,6 +63,7 @@ typedef struct dt_gaussian_cl_global_t
   int kernel_gaussian_column_4c, kernel_gaussian_transpose_4c;
   int kernel_gaussian_column_2c, kernel_gaussian_transpose_2c;
   int kernel_gaussian_column_1c, kernel_gaussian_transpose_1c;
+  int kernel_gaussian_9x9;
 } dt_gaussian_cl_global_t;
 
 
@@ -89,7 +90,7 @@ dt_gaussian_cl_t *dt_gaussian_init_cl(const int devid, const int width, const in
                                       const float *max, const float *min, const float sigma, const int order);
 
 cl_int dt_gaussian_blur_cl(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev_out);
-cl_int dt_gaussian_blur_cl_buffer(dt_gaussian_cl_t *g, cl_mem dev_in, cl_mem dev_out);
+cl_int dt_gaussian_fast_blur_cl_buffer(const int devid, cl_mem dev_in, cl_mem dev_out, const int width, const int height, const float sigma, const int ch, const float *min, const float *max);
 
 void dt_gaussian_free_cl(dt_gaussian_cl_t *g);
 #endif

From 1bc7a1dcc3f0fe52e84a91fb9887584bd3ba4368 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 2 Oct 2024 06:16:15 +0200
Subject: [PATCH 2/3] Color equalizer gets a perf boost

When the sigma for gaussian blurring is small we can use the faster
dt_gaussian_fast_blur_cl_buffer() and dt_gaussian_fast_blur() variants.

As gaussian blurring is a major performance bottleneck in color equalizer, the overall
performance almost doubles if the radii for the guided filter are small, as they are with the defaults.
---
 src/iop/colorequal.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/src/iop/colorequal.c b/src/iop/colorequal.c
index d1941565dbf7..f51f03b42891 100644
--- a/src/iop/colorequal.c
+++ b/src/iop/colorequal.c
@@ -441,20 +441,27 @@ int legacy_params(dt_iop_module_t *self,
 void _mean_gaussian(float *const buf,
                     const int width,
                     const int height,
-                    const uint32_t ch,
+                    const int ch,
                     const float sigma)
 {
   // We use unbounded signals, so don't care for the internal value clipping
   const float range = 1.0e9;
-  const dt_aligned_pixel_t max = {range, range, range, range};
-  const dt_aligned_pixel_t min = {-range, -range, -range, -range};
-  dt_gaussian_t *g = dt_gaussian_init(width, height, ch, max, min, sigma, DT_IOP_GAUSSIAN_ZERO);
-  if(!g) return;
-  if(ch == 4)
-    dt_gaussian_blur_4c(g, buf, buf);
+  if(sigma <= 1.5f)
+  {
+    dt_gaussian_fast_blur(buf, buf, width, height, sigma, -range, range, ch);
+  }
   else
-    dt_gaussian_blur(g, buf, buf);
-  dt_gaussian_free(g);
+  {
+    const dt_aligned_pixel_t max = {range, range, range, range};
+    const dt_aligned_pixel_t min = {-range, -range, -range, -range};
+    dt_gaussian_t *g = dt_gaussian_init(width, height, ch, max, min, sigma, DT_IOP_GAUSSIAN_ZERO);
+    if(!g) return;
+    if(ch == 4)
+      dt_gaussian_blur_4c(g, buf, buf);
+    else
+      dt_gaussian_blur(g, buf, buf);
+    dt_gaussian_free(g);
+  }
 }
 
 
@@ -1234,12 +1241,7 @@ int _mean_gaussian_cl(const int devid,
   const dt_aligned_pixel_t max = {range, range, range, range};
   const dt_aligned_pixel_t min = {-range, -range, -range, -range};
 
-  dt_gaussian_cl_t *g = dt_gaussian_init_cl(devid, width, height, ch, max, min, sigma, DT_IOP_GAUSSIAN_ZERO);
-  if(!g) return DT_OPENCL_PROCESS_CL;
-
-  cl_int err = dt_gaussian_blur_cl_buffer(g, image, image);
-  dt_gaussian_free_cl(g);
-  return err;
+  return dt_gaussian_fast_blur_cl_buffer(devid, image, image, width, height, sigma, ch, min, max);
 }
 
 static cl_mem _init_covariance_cl(const int devid,

From 2c91caef1b4d592b7d7b2d6b1886f0fc9ccfe22f Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Tue, 1 Oct 2024 06:22:57 +0200
Subject: [PATCH 3/3] Gaussian blurring maintenance

For the details threshold, dual demosaicing, mask blurring and segmentation gradients we used
special-purpose gaussian blurs.

All of these algorithms now use the public dt_gaussian_fast_blur_cl_buffer() and dt_gaussian_fast_blur()
functions; the internal code has been removed.
---
 data/kernels/demosaic_rcd.cl     |  34 +-------
 src/develop/blend.c              |  29 ++-----
 src/develop/blend.h              |   2 +-
 src/develop/masks.h              |  12 ---
 src/develop/masks/detail.c       | 140 +++----------------------------
 src/iop/demosaic.c               |   1 +
 src/iop/demosaicing/dual.c       |  50 +++++------
 src/iop/hlreconstruct/segbased.c |  41 +++++++--
 8 files changed, 80 insertions(+), 229 deletions(-)

diff --git a/data/kernels/demosaic_rcd.cl b/data/kernels/demosaic_rcd.cl
index c7ba6c04e63e..600f0b028d06 100644
--- a/data/kernels/demosaic_rcd.cl
+++ b/data/kernels/demosaic_rcd.cl
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    rcd_cl implemented Hanno Schwalm (hanno@schwalm-bremen.de)
+    Copyright (C) 2020-2024 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -346,38 +346,6 @@ __kernel void calc_detail_blend(global float *in, global float *out, const int w
   out[idx] = detail ? blend : 1.0f - blend;
 }
 
-__kernel void fastblur_mask_9x9(global float *src, global float *out, const int w, const int height, global const float *kern)
-{
-  const int col = get_global_id(0);
-  const int row = get_global_id(1);
-  if((col >= w) || (row >= height)) return;
-
-  const int oidx = mad24(row, w, col);
-  int incol = col < 4 ? 4 : col;
-  incol = col > w - 5 ? w - 5 : incol;
-  int inrow = row < 4 ? 4 : row;
-  inrow = row > height - 5 ? height - 5 : inrow;
-  const int i = mad24(inrow, w, incol);
-
-  const int w2 = 2 * w;
-  const int w3 = 3 * w;
-  const int w4 = 4 * w;
-  const float val = kern[12] * (src[i - w4 - 2] + src[i - w4 + 2] + src[i - w2 - 4] + src[i - w2 + 4] + src[i + w2 - 4] + src[i + w2 + 4] + src[i + w4 - 2] + src[i + w4 + 2]) +
-                    kern[11] * (src[i - w4 - 1] + src[i - w4 + 1] + src[i -  w - 4] + src[i -  w + 4] + src[i +  w - 4] + src[i +  w + 4] + src[i + w4 - 1] + src[i + w4 + 1]) +
-                    kern[10] * (src[i - w4] + src[i - 4] + src[i + 4] + src[i + w4]) +
-                    kern[9] * (src[i - w3 - 3] + src[i - w3 + 3] + src[i + w3 - 3] + src[i + w3 + 3]) +
-                    kern[8] * (src[i - w3 - 2] + src[i - w3 + 2] + src[i - w2 - 3] + src[i - w2 + 3] + src[i + w2 - 3] + src[i + w2 + 3] + src[i + w3 - 2] + src[i + w3 + 2]) +
-                    kern[7] * (src[i - w3 - 1] + src[i - w3 + 1] + src[i -  w - 3] + src[i -  w + 3] + src[i +  w - 3] + src[i +  w + 3] + src[i + w3 - 1] + src[i + w3 + 1]) +
-                    kern[6] * (src[i - w3] + src[i - 3] + src[i + 3] + src[i + w3]) +
-                    kern[5] * (src[i - w2 - 2] + src[i - w2 + 2] + src[i + w2 - 2] + src[i + w2 + 2]) +
-                    kern[4] * (src[i - w2 - 1] + src[i - w2 + 1] + src[i -  w - 2] + src[i -  w + 2] + src[i +  w - 2] + src[i +  w + 2] + src[i + w2 - 1] + src[i + w2 + 1]) +
-                    kern[3] * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) +
-                    kern[2] * (src[i -  w - 1] + src[i -  w + 1] + src[i +  w - 1] + src[i +  w + 1]) +
-                    kern[1] * (src[i -  w] + src[i - 1] + src[i + 1] + src[i +  w]) +
-                    kern[0] * src[i];
-  out[oidx] = clamp(val, 0.0f, 1.0f);
-}
-
 kernel void rcd_border_green(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
                     const unsigned int filters, local float *buffer, const int border)
 {
diff --git a/src/develop/blend.c b/src/develop/blend.c
index 8e551497ca2c..75e697f247be 100644
--- a/src/develop/blend.c
+++ b/src/develop/blend.c
@@ -880,25 +880,13 @@ static void _refine_with_detail_mask_cl(struct dt_iop_module_t *self,
           CLARG(out), CLARG(blur), CLARG(iwidth), CLARG(iheight), CLARG(threshold), CLARG(detail));
   if(err != CL_SUCCESS) goto error;
 
-  float blurmat[13];
-  dt_masks_blur_coeff(blurmat, 2.0f);
-  cl_mem dev_blurmat = dt_opencl_copy_host_to_device_constant(devid, sizeof(blurmat), blurmat);
-  if(dev_blurmat != NULL)
-  {
-    err = dt_opencl_enqueue_kernel_2d_args
-          (devid, darktable.opencl->blendop->kernel_mask_blur, iwidth, iheight,
-           CLARG(blur), CLARG(out), CLARG(iwidth), CLARG(iheight), CLARG(dev_blurmat));
-    dt_opencl_release_mem_object(dev_blurmat);
-    if(err != CL_SUCCESS) goto error;
-
-    err = dt_opencl_read_buffer_from_device(devid, lum, out, 0, sizeof(float) * iwidth * iheight, TRUE);
-    if(err != CL_SUCCESS) goto error;
-  }
-  else
-  {
-    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
-    goto error;
-  }
+  const dt_aligned_pixel_t max = {1.0f, 1.0f, 1.0f, 1.0f};
+  const dt_aligned_pixel_t min = {0.0f, 0.0f, 0.0f, 0.0f};
+  err = dt_gaussian_fast_blur_cl_buffer(devid, blur, out, iwidth, iheight, 2.0f, 1, min, max);
+  if(err != CL_SUCCESS) goto error;
+
+  err = dt_opencl_read_buffer_from_device(devid, lum, out, 0, sizeof(float) * iwidth * iheight, TRUE);
+  if(err != CL_SUCCESS) goto error;
 
   dt_opencl_release_mem_object(blur);
   dt_opencl_release_mem_object(out);
@@ -1587,8 +1575,6 @@ dt_blendop_cl_global_t *dt_develop_blend_init_cl_global(void)
     dt_opencl_create_kernel(program_rcd, "calc_scharr_mask");
   b->kernel_calc_blend =
     dt_opencl_create_kernel(program_rcd, "calc_detail_blend");
-  b->kernel_mask_blur  =
-    dt_opencl_create_kernel(program_rcd, "fastblur_mask_9x9");
 
   return b;
 #else
@@ -1618,7 +1604,6 @@ void dt_develop_blend_free_cl_global(dt_blendop_cl_global_t *b)
   dt_opencl_free_kernel(b->kernel_calc_Y0_mask);
   dt_opencl_free_kernel(b->kernel_calc_scharr_mask);
   dt_opencl_free_kernel(b->kernel_calc_blend);
-  dt_opencl_free_kernel(b->kernel_mask_blur);
   free(b);
 #endif
 }
diff --git a/src/develop/blend.h b/src/develop/blend.h
index ca3235f0595f..9275c6cccf18 100644
--- a/src/develop/blend.h
+++ b/src/develop/blend.h
@@ -26,6 +26,7 @@
 #include "dtgtk/gradientslider.h"
 #include "gui/color_picker_proxy.h"
 #include "common/imagebuf.h"
+#include "common/gaussian.h"
 
 #define DEVELOP_BLEND_VERSION (13)
 
@@ -238,7 +239,6 @@ typedef struct dt_blendop_cl_global_t
   int kernel_calc_Y0_mask;
   int kernel_calc_scharr_mask;
   int kernel_calc_blend;
-  int kernel_mask_blur;
   int kernel_blendop_highlights_mask;
 } dt_blendop_cl_global_t;
 
diff --git a/src/develop/masks.h b/src/develop/masks.h
index 5335969ac5e0..1bc7828d8ad6 100644
--- a/src/develop/masks.h
+++ b/src/develop/masks.h
@@ -667,18 +667,6 @@ void dt_masks_calculate_source_pos_value(dt_masks_form_gui_t *gui,
                                          const int adding);
 
 /** detail mask support */
-void dt_masks_extend_border(float *const mask,
-                            const int width,
-                            const int height,
-                            const int border);
-void dt_masks_blur_coeff(float *coeffs, const float sigma);
-void dt_masks_blur(float *const src,
-                   float *const out,
-                   const int width,
-                   const int height,
-                   const float sigma,
-                   const float gain,
-                   const float clip);
 gboolean dt_masks_calc_scharr_mask(dt_dev_detail_mask_t *details,
                                    float *const src,
                                    const dt_aligned_pixel_t wb);
diff --git a/src/develop/masks/detail.c b/src/develop/masks/detail.c
index 70547282d00a..b222611e5b85 100644
--- a/src/develop/masks/detail.c
+++ b/src/develop/masks/detail.c
@@ -73,10 +73,9 @@
   with the threshold and scharr as parameters.
 
   At last the IM is slightly blurred to avoid hard transitions, as
-  there still is no scaling we can use a constant sigma. As the
-  blur_9x9 is pretty fast both in openmp/cl code paths - much faster
-  than dt gaussians - it is used here.  Now we have an unscaled detail
-  mask which requires to be transformed through the pipeline using
+  there still is no scaling we can use a constant sigma.
+  Now we have an unscaled detail mask which requires to be transformed
+  through the pipeline using
 
   float *dt_dev_distort_detail_mask(const dt_dev_pixelpipe_t *pipe, float *src, const dt_iop_module_t *target_module)
 
@@ -98,127 +97,13 @@
      detail refinement.
 
   3. Of course credit goes to Ingo @heckflosse from rt team for the
-     original idea. (in the rt world this is knowb as details mask)
+     original idea. (in the rt world this is known as details mask)
 
   4. Thanks to rawfiner for pointing out how to use Y0 and scharr for better maths.
 
   hanno@schwalm-bremen.de 21/04/29
 */
 
-void dt_masks_extend_border(float *const mask,
-                            const int width,
-                            const int height,
-                            const int border)
-{
-  if(border <= 0) return;
-  DT_OMP_FOR()
-  for(size_t row = border; row < height - border; row++)
-  {
-    const size_t idx = row * width;
-    for(size_t i = 0; i < border; i++)
-    {
-      mask[idx + i] = mask[idx + border];
-      mask[idx + width - i - 1] = mask[idx + width - border -1];
-    }
-  }
-  DT_OMP_FOR()
-  for(size_t col = 0; col < width; col++)
-  {
-    const float top = mask[border * width + MIN(width - border - 1, MAX(col, border))];
-    const float bot = mask[(height - border - 1) * width
-                           + MIN(width - border - 1, MAX(col, border))];
-    for(size_t i = 0; i < border; i++)
-    {
-      mask[col + i * width] = top;
-      mask[col + (height - i - 1) * width] = bot;
-    }
-  }
-}
-
-void dt_masks_blur_coeff(float *c, const float sigma)
-{
-  float kernel[9][9];
-  const float temp = -2.0f * sqrf(sigma);
-  const float range = sqrf(3.0f * 1.5f);
-  float sum = 0.0f;
-  for(int k = -4; k <= 4; k++)
-  {
-    for(int j = -4; j <= 4; j++)
-    {
-      if((sqrf(k) + sqrf(j)) <= range)
-      {
-        kernel[k + 4][j + 4] = expf((sqrf(k) + sqrf(j)) / temp);
-        sum += kernel[k + 4][j + 4];
-      }
-      else
-        kernel[k + 4][j + 4] = 0.0f;
-    }
-  }
-  for(int i = 0; i < 9; i++)
-  {
-#if defined(__GNUC__)
-  #pragma GCC ivdep
-#endif
-    for(int j = 0; j < 9; j++)
-      kernel[i][j] /= sum;
-  }
-  /* c00 */ c[0]  = kernel[4][4];
-  /* c10 */ c[1]  = kernel[3][4];
-  /* c11 */ c[2]  = kernel[3][3];
-  /* c20 */ c[3]  = kernel[2][4];
-  /* c21 */ c[4]  = kernel[2][3];
-  /* c22 */ c[5]  = kernel[2][2];
-  /* c30 */ c[6]  = kernel[1][4];
-  /* c31 */ c[7]  = kernel[1][3];
-  /* c32 */ c[8]  = kernel[1][2];
-  /* c33 */ c[9]  = kernel[1][1];
-  /* c40 */ c[10] = kernel[0][4];
-  /* c41 */ c[11] = kernel[0][3];
-  /* c42 */ c[12] = kernel[0][2];
-}
-
-#define FAST_BLUR_9 ( \
-  blurmat[12] * (src[i - w4 - 2] + src[i - w4 + 2] + src[i - w2 - 4] + src[i - w2 + 4] + src[i + w2 - 4] + src[i + w2 + 4] + src[i + w4 - 2] + src[i + w4 + 2]) + \
-  blurmat[11] * (src[i - w4 - 1] + src[i - w4 + 1] + src[i - w1 - 4] + src[i - w1 + 4] + src[i + w1 - 4] + src[i + w1 + 4] + src[i + w4 - 1] + src[i + w4 + 1]) + \
-  blurmat[10] * (src[i - w4] + src[i - 4] + src[i + 4] + src[i + w4]) + \
-  blurmat[9]  * (src[i - w3 - 3] + src[i - w3 + 3] + src[i + w3 - 3] + src[i + w3 + 3]) + \
-  blurmat[8]  * (src[i - w3 - 2] + src[i - w3 + 2] + src[i - w2 - 3] + src[i - w2 + 3] + src[i + w2 - 3] + src[i + w2 + 3] + src[i + w3 - 2] + src[i + w3 + 2]) + \
-  blurmat[7]  * (src[i - w3 - 1] + src[i - w3 + 1] + src[i - w1 - 3] + src[i - w1 + 3] + src[i + w1 - 3] + src[i + w1 + 3] + src[i + w3 - 1] + src[i + w3 + 1]) + \
-  blurmat[6]  * (src[i - w3] + src[i - 3] + src[i + 3] + src[i + w3]) + \
-  blurmat[5]  * (src[i - w2 - 2] + src[i - w2 + 2] + src[i + w2 - 2] + src[i + w2 + 2]) + \
-  blurmat[4]  * (src[i - w2 - 1] + src[i - w2 + 1] + src[i - w1 - 2] + src[i - w1 + 2] + src[i + w1 - 2] + src[i + w1 + 2] + src[i + w2 - 1] + src[i + w2 + 1]) + \
-  blurmat[3]  * (src[i - w2] + src[i - 2] + src[i + 2] + src[i + w2]) + \
-  blurmat[2]  * (src[i - w1 - 1] + src[i - w1 + 1] + src[i + w1 - 1] + src[i + w1 + 1]) + \
-  blurmat[1]  * (src[i - w1] + src[i - 1] + src[i + 1] + src[i + w1]) + \
-  blurmat[0]  * src[i] )
-
-void dt_masks_blur(float *const restrict src,
-                   float *const restrict out,
-                   const int width,
-                   const int height,
-                   const float sigma,
-                   const float gain,
-                   const float clip)
-{
-  float blurmat[13];
-  dt_masks_blur_coeff(blurmat, sigma);
-
-  const size_t w1 = width;
-  const size_t w2 = 2*width;
-  const size_t w3 = 3*width;
-  const size_t w4 = 4*width;
-  DT_OMP_FOR()
-  for(size_t row = 4; row < height - 4; row++)
-  {
-    for(size_t col = 4; col < width - 4; col++)
-    {
-      const size_t i = row * width + col;
-      out[i] = fmaxf(0.0f, fminf(clip, gain * FAST_BLUR_9));
-    }
-  }
-  dt_masks_extend_border(out, width, height, 4);
-}
-
 gboolean dt_masks_calc_scharr_mask(dt_dev_detail_mask_t *details,
                                       float *const restrict src,
                                       const dt_aligned_pixel_t wb)
@@ -242,16 +127,18 @@ gboolean dt_masks_calc_scharr_mask(dt_dev_detail_mask_t *details,
   }
 
   DT_OMP_FOR()
-  for(size_t row = 1; row < height - 1; row++)
+  for(size_t row = 0; row < height; row++)
   {
-    for(size_t col = 1; col < width - 1; col++)
+    const int irow = CLAMP(row, 1, height -2);
+    for(size_t col = 0; col < width; col++)
     {
-      const size_t idx = row * width + col;
+      const int icol = CLAMP(col, 1, width -2);
+      const size_t idx = (size_t)irow * width + icol;
+
       const float gradient_magnitude = scharr_gradient(&tmp[idx], width);
-      mask[idx] = fminf(1.0f, fmaxf(0.0f, gradient_magnitude / 16.0f));
+      mask[(size_t)row * width + col] = fminf(1.0f, fmaxf(0.0f, gradient_magnitude / 16.0f));
     }
   }
-  dt_masks_extend_border(mask, width, height, 1);
   dt_free_align(tmp);
   return FALSE;
 }
@@ -293,12 +180,11 @@ float *dt_masks_calc_detail_mask(struct dt_dev_pixelpipe_iop_t *piece,
     tmp[idx] = detail ? blend : 1.0f - blend;
   }
   // for very small images the blurring should be slightly less to have an effect at all
-  const float blurring = (MIN(details->roi.width, details->roi.height) < 500) ? 1.5f : 2.0f;
-  dt_masks_blur(tmp, mask, details->roi.width, details->roi.height, blurring, 1.0f, 1.0f);
+  const float sigma = (MIN(details->roi.width, details->roi.height) < 500) ? 1.5f : 2.0f;
+  dt_gaussian_fast_blur(tmp, mask, details->roi.width, details->roi.height, sigma, 0.0f, 1.0f, 1);
   dt_free_align(tmp);
   return mask;
 }
-#undef FAST_BLUR_9
 
 
 // clang-format off
diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c
index 0efe3ac212af..b18542f5b9b5 100644
--- a/src/iop/demosaic.c
+++ b/src/iop/demosaic.c
@@ -24,6 +24,7 @@
 #include "common/image_cache.h"
 #include "common/math.h"
 #include "common/imagebuf.h"
+#include "common/gaussian.h"
 #include "control/conf.h"
 #include "control/control.h"
 #include "develop/develop.h"
diff --git a/src/iop/demosaicing/dual.c b/src/iop/demosaicing/dual.c
index f733f0500934..9055bbb30b7f 100644
--- a/src/iop/demosaicing/dual.c
+++ b/src/iop/demosaicing/dual.c
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    Copyright (C) 2010-2023 darktable developers.
+    Copyright (C) 2010-2024 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -83,54 +83,48 @@ static void dual_demosaic(
 }
 
 #ifdef HAVE_OPENCL
-gboolean dual_demosaic_cl(
-        struct dt_iop_module_t *self,
-        dt_dev_pixelpipe_iop_t *piece,
-        cl_mem high_image,
-        cl_mem low_image,
-        cl_mem out,
-        const dt_iop_roi_t *const roi_in,
-        const int dual_mask)
+gboolean dual_demosaic_cl(struct dt_iop_module_t *self,
+                          dt_dev_pixelpipe_iop_t *piece,
+                          cl_mem high_image,
+                          cl_mem low_image,
+                          cl_mem out,
+                          const dt_iop_roi_t *const roi_in,
+                          const int dual_mask)
 {
   const int devid = piece->pipe->devid;
   const int width = roi_in->width;
   const int height = roi_in->height;
-  const int clwidth = ROUNDUPDWD(width, devid);
-  const int clheight = ROUNDUPDHT(height, devid);
 
   dt_iop_demosaic_data_t *data = (dt_iop_demosaic_data_t *)piece->data;
   dt_iop_demosaic_global_data_t *gd = (dt_iop_demosaic_global_data_t *)self->global_data;
 
   const float contrastf = slider2contrast(data->dual_thrs);
 
-  cl_int err = CL_SUCCESS;
-  cl_mem dev_blurmat = NULL;
-  cl_mem mask = dt_opencl_alloc_device_buffer(devid, width * height * sizeof(float));
-  cl_mem tmp = dt_opencl_alloc_device_buffer(devid, width * height * sizeof(float));
+  cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+  cl_mem mask = NULL;
+  cl_mem tmp = NULL;
+  const size_t bsize = sizeof(float) * width * height;
 
-  err = dt_opencl_write_buffer_to_device(devid, piece->pipe->scharr.data, tmp, 0, sizeof(float) * width * height, TRUE);
-  if(err != CL_SUCCESS) goto finish;
+  tmp = dt_opencl_copy_host_to_device_constant(devid, bsize, piece->pipe->scharr.data);
+  mask = dt_opencl_alloc_device_buffer(devid, bsize);
+  if(mask == NULL || tmp == NULL) goto finish;
 
-  const int flag = 1;
-  err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_blend, clwidth, clheight,
-      CLARG(tmp), CLARG(mask), CLARG(width), CLARG(height), CLARG(contrastf), CLARG(flag));
+  const int detail = 1;
+  err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_blend, width, height,
+      CLARG(tmp), CLARG(mask), CLARG(width), CLARG(height), CLARG(contrastf), CLARG(detail));
   if(err != CL_SUCCESS) goto finish;
 
-  float blurmat[13];
-  dt_masks_blur_coeff(blurmat, 2.0f);
-  dev_blurmat = dt_opencl_copy_host_to_device_constant(devid, sizeof(blurmat), blurmat);
-
-  err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_mask_blur, clwidth, clheight,
-      CLARG(mask), CLARG(tmp), CLARG(width), CLARG(height), CLARG(dev_blurmat));
+  const dt_aligned_pixel_t max = {1.0f, 1.0f, 1.0f, 1.0f};
+  const dt_aligned_pixel_t min = {0.0f, 0.0f, 0.0f, 0.0f};
+  err = dt_gaussian_fast_blur_cl_buffer(devid, mask, tmp, width, height, 2.0f, 1, min, max);
   if(err != CL_SUCCESS) goto finish;
 
-  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_write_blended_dual, clwidth, clheight,
+  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_write_blended_dual, width, height,
       CLARG(high_image), CLARG(low_image), CLARG(out), CLARG(width), CLARG(height), CLARG(tmp), CLARG(dual_mask));
 
   finish:
   dt_opencl_release_mem_object(mask);
   dt_opencl_release_mem_object(tmp);
-  dt_opencl_release_mem_object(dev_blurmat);
   return err;
 }
 #endif
diff --git a/src/iop/hlreconstruct/segbased.c b/src/iop/hlreconstruct/segbased.c
index c370f3d30616..65da09d6d35d 100644
--- a/src/iop/hlreconstruct/segbased.c
+++ b/src/iop/hlreconstruct/segbased.c
@@ -418,6 +418,36 @@ static inline size_t _raw_to_plane(const int width, const int row, const int col
   return (HL_BORDER + (row / 3)) * width + (col / 3) + HL_BORDER;
 }
 
+static void _masks_extend_border(float *const mask, // width*height floats, border pixels are overwritten in place
+                                 const int width,
+                                 const int height,
+                                 const int border)
+{
+  if(border <= 0) return; // nothing to extend
+  DT_OMP_FOR()
+  for(size_t row = border; row < height - border; row++) // NOTE(review): assumes width and height exceed 2*border - confirm with callers
+  {
+    const size_t idx = row * width;
+    for(size_t i = 0; i < border; i++)
+    {
+      mask[idx + i] = mask[idx + border]; // left border takes the first interior value of the row
+      mask[idx + width - i - 1] = mask[idx + width - border -1]; // right border takes the last interior value
+    }
+  }
+  DT_OMP_FOR()
+  for(size_t col = 0; col < width; col++) // all columns, so the corners get filled here too
+  {
+    const float top = mask[border * width + MIN(width - border - 1, MAX(col, border))]; // first interior row, column clamped to the interior
+    const float bot = mask[(height - border - 1) * width
+                           + MIN(width - border - 1, MAX(col, border))]; // last interior row, same column clamp
+    for(size_t i = 0; i < border; i++)
+    {
+      mask[col + i * width] = top;
+      mask[col + (height - i - 1) * width] = bot;
+    }
+  }
+}
+
 static void _process_segmentation(dt_dev_pixelpipe_iop_t *piece,
                                   const float *const input,
                                   float *const output,
@@ -541,7 +571,7 @@ static void _process_segmentation(dt_dev_pixelpipe_iop_t *piece,
     goto finish;
 
   for(int i = 0; i < HL_RGB_PLANES; i++)
-    dt_masks_extend_border(plane[i], pwidth, pheight, HL_BORDER);
+    _masks_extend_border(plane[i], pwidth, pheight, HL_BORDER);
 
   for(int p = 0; p < HL_RGB_PLANES; p++)
     dt_segments_combine(&isegments[p], data->combine);
@@ -616,9 +646,8 @@ static void _process_segmentation(dt_dev_pixelpipe_iop_t *piece,
         distance[i] = (segall->data[i] == 1) ? DT_DISTANCE_TRANSFORM_MAX : 0.0f;
       }
     }
-    dt_masks_extend_border(tmp, pwidth, pheight, segall->border);
-    dt_masks_blur(tmp, luminance, pwidth, pheight, 1.2f, 1.0f, 20.0f);
-    dt_masks_extend_border(luminance, pwidth, pheight, segall->border);
+    _masks_extend_border(tmp, pwidth, pheight, segall->border);
+    dt_gaussian_fast_blur(tmp, luminance, pwidth, pheight, 1.2f, 0.0f, 20.0f, 1);
   }
 
   if(do_recovery)
@@ -628,7 +657,7 @@ static void _process_segmentation(dt_dev_pixelpipe_iop_t *piece,
     {
       dt_segmentize_plane(segall);
       _initial_gradients(pwidth, pheight, luminance, distance, recout);
-      dt_masks_extend_border(recout, pwidth, pheight, segall->border);
+      _masks_extend_border(recout, pwidth, pheight, segall->border);
 
       // now we check for significant all-clipped-segments and reconstruct data
       for(uint32_t id = 2; id < segall->nr; id++)
@@ -639,7 +668,7 @@ static void _process_segmentation(dt_dev_pixelpipe_iop_t *piece,
           _segment_gradients(distance, recout, tmp, recovery_mode, segall, id, recovery_close);
       }
 
-      dt_masks_blur(recout, gradient, pwidth, pheight, 1.2f, 1.0f, 20.0f);
+      dt_gaussian_fast_blur(recout, gradient, pwidth, pheight, 1.2f, 0.0f, 20.0f, 1);
       // possibly add some noise
       const float noise_level = data->noise_level;
       if(noise_level > 0.0f)