From e6fadb92c01a885369228662e91c1dfaa8658bb8 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Tue, 24 Jun 2025 06:26:40 +0200
Subject: [PATCH] Implement capture sharpening inside demosaic module

Capture sharpening has been implemented inside the demosaic module, so it is available for raw images only.
Credits to Ingo Weyrich (heckflosse67@gmx.de), who implemented the original algorithm for RawTherapee;
this implementation is based on his work, especially the convolution kernels.
CPU and OpenCL code paths are both available.
The demosaic module gets more parameters, so there is a version bump; one still-unused float parameter has been reserved.

A "mini manual"

Capture sharpening (CS) tries to recover details lost due to in-camera blurring, which can be caused by diffraction,
the anti-aliasing filter or other sources of gaussian-type blur.
Prerequisites are
- good white balance parameters (same requirement as for highlights reconstruction or demosaic)
- no chromatic aberration, you might want to add the "raw chromatic aberration" module
- sensor noise will be amplified by CS

controls:
1. capture sharpen
   switches CS on if above zero and defines the strength of the overall effect.
   CS works iteratively and this value is the number of iterations; a setting of 10 is enough in most cases.
2. radius
   defines the base sigma of the gaussian convolution.
   This should not be chosen for creative effect but set to the blurring radius of the optical system and sensor;
   values that are too large lead to artifacts like halos.
   A suitable radius can be calculated internally. This is done if you either
   a) click on the button beside the slider, or
   b) activate capture sharpen for the first time after resetting to demosaic defaults or developing old edits.
3. contrast threshold
   As sensor noise is amplified by CS, a per-pixel variance analysis is used to restrict CS
   to locations with higher variance. The default is good for low to medium ISO images.
4. corner boost
   Increases the radius in the image corners. A centered circle of 1/2 of the image size is assumed to be "sharp"
   (only the main radius is used there); locations outside this circle get an increased convolution radius.
---
 data/kernels/capture.cl       | 285 ++++++++++++
 data/kernels/programs.conf    |   1 +
 src/iop/demosaic.c            | 293 ++++++++++--
 src/iop/demosaicing/capture.c | 836 ++++++++++++++++++++++++++++++++++
 4 files changed, 1375 insertions(+), 40 deletions(-)
 create mode 100644 data/kernels/capture.cl
 create mode 100644 src/iop/demosaicing/capture.c

diff --git a/data/kernels/capture.cl b/data/kernels/capture.cl
new file mode 100644
index 000000000000..5c311163dd31
--- /dev/null
+++ b/data/kernels/capture.cl
@@ -0,0 +1,285 @@
+/*
+    This file is part of darktable,
+    copyright (c) 2025 darktable developer.
+
+    darktable is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    darktable is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with darktable.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "common.h"
+
+#define CAPTURE_KERNEL_ALIGN 32
+#define CAPTURE_BLEND_EPS 0.01f
+#define CAPTURE_YMIN 0.001f
+#define CAPTURE_THRESHPOWER 0.15f
+/* Square of a scalar; used for the squared deviations in modify_blend. */
+static inline float sqrf(float a)
+{
+  return a * a;
+}
+/* 9x9 convolution step: out[i] is multiplied by the kernel-weighted sum of `in` around pixel i. */
+__kernel void kernel_9x9_mul(global float *in,
+                             global float *out,
+                             global float *blend,       // pixels with blend <= CAPTURE_BLEND_EPS are skipped
+                             global float *kernels,     // coefficient sets, CAPTURE_KERNEL_ALIGN floats apart
+                             global unsigned char *table, // per-pixel index selecting the coefficient set
+                             const int w1,              // image width, also the row stride of all buffers
+                             const int height)
+{
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
+  if(col >= w1 || row >= height) return;
+
+  const int i = mad24(row, w1, col);
+  const int w2 = 2 * w1;
+  const int w3 = 3 * w1;
+  const int w4 = 4 * w1;
+  if(blend[i] <= CAPTURE_BLEND_EPS)
+     return;
+
+  global const float *kern = kernels + CAPTURE_KERNEL_ALIGN * table[i];
+  global float *d = in + i;
+  // weight for offset (dy, dx) is kern[5 * |dy| + |dx|]; the unrolled sum shares one weight between (dy, dx) and (dx, dy)
+  float val = 0.0f;
+  if(col >= 4 && row >= 4 && col < w1 - 4 && row < height - 4)
+  {
+    val = kern[10+4] * (d[-w4-2]  + d[-w4+2]  + d[-w2-4]  + d[-w2+4] + d[w2-4] + d[w2+4] + d[w4-2] + d[w4+2]) +
+          kern[5 +4] * (d[-w4-1]  + d[-w4+1]  + d[-w1-4]  + d[-w1+4] + d[w1-4] + d[w1+4] + d[w4-1] + d[w4+1]) +
+          kern[4]    * (d[-w4  ]  + d[   -4]  + d[    4]  + d[ w4  ]) +
+          kern[15+3] * (d[-w3-3]  + d[-w3+3]  + d[ w3-3]  + d[ w3+3]) +
+          kern[10+3] * (d[-w3-2]  + d[-w3+2]  + d[-w2-3]  + d[-w2+3] + d[w2-3] + d[w2+3] + d[w3-2] + d[w3+2]) +
+          kern[ 5+3] * (d[-w3-1]  + d[-w3+1]  + d[-w1-3]  + d[-w1+3] + d[w1-3] + d[w1+3] + d[w3-1] + d[w3+1]) +
+          kern[   3] * (d[-w3  ]  + d[   -3]  + d[    3]  + d[ w3  ]) +
+          kern[10+2] * (d[-w2-2]  + d[-w2+2]  + d[ w2-2]  + d[ w2+2]) +
+          kern[ 5+2] * (d[-w2-1]  + d[-w2+1]  + d[-w1-2]  + d[-w1+2] + d[w1-2] + d[w1+2] + d[w2-1] + d[w2+1]) +
+          kern[   2] * (d[-w2  ]  + d[   -2]  + d[    2]  + d[ w2  ]) +
+          kern[ 5+1] * (d[-w1-1]  + d[-w1+1]  + d[ w1-1]  + d[ w1+1]) +
+          kern[   1] * (d[-w1  ]  + d[   -1]  + d[    1]  + d[ w1  ]) +
+          kern[   0] * (d[0]);
+  }
+  else // pixels closer than 4 to an edge: generic loop, taps outside the image are skipped
+  { // NOTE(review): this loop also adds offsets (3,4), (4,3), (4,4) which the unrolled sum omits - presumably zero weights, confirm
+    for(int ir = -4; ir <= 4; ir++)
+    {
+      const int irow = row+ir;
+      if(irow >= 0 && irow < height)
+      {
+        for(int ic = -4; ic <= 4; ic++)
+        {
+          const int icol = col+ic;
+          if(icol >=0 && icol < w1)
+            val += kern[5 * abs(ir) + abs(ic)] * in[mad24(irow, w1, icol)];
+        }
+      }
+    }
+  }
+  out[i] *= val;
+}
+/* 9x9 convolution step: out[i] = divbuff[i] / (kernel-weighted sum of `in` around pixel i). */
+__kernel void kernel_9x9_div(global float *in,
+                             global float *out,
+                             global float *divbuff,     // numerator, read at the same index as out
+                             global float *blend,       // pixels with blend <= CAPTURE_BLEND_EPS are skipped
+                             global float *kernels,     // coefficient sets, CAPTURE_KERNEL_ALIGN floats apart
+                             global unsigned char *table, // per-pixel index selecting the coefficient set
+                             const int w1,              // image width, also the row stride of all buffers
+                             const int height)
+{
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
+  if(col >= w1 || row >= height) return;
+
+  const int i = mad24(row, w1, col);
+  const int w2 = 2 * w1;
+  const int w3 = 3 * w1;
+  const int w4 = 4 * w1;
+  if(blend[i] <= CAPTURE_BLEND_EPS)
+    return;
+
+  global const float *kern = kernels + CAPTURE_KERNEL_ALIGN * table[i];
+  global float *d = in + i;
+  // weight for offset (dy, dx) is kern[5 * |dy| + |dx|]; the unrolled sum shares one weight between (dy, dx) and (dx, dy)
+  float val = 0.0f;
+  if(col >= 4 && row >= 4 && col < w1 - 4 && row < height - 4)
+  {
+    val = kern[10+4] * (d[-w4-2]  + d[-w4+2]  + d[-w2-4]  + d[-w2+4] + d[w2-4] + d[w2+4] + d[w4-2] + d[w4+2]) +
+          kern[5 +4] * (d[-w4-1]  + d[-w4+1]  + d[-w1-4]  + d[-w1+4] + d[w1-4] + d[w1+4] + d[w4-1] + d[w4+1]) +
+          kern[4]    * (d[-w4  ]  + d[   -4]  + d[    4]  + d[ w4  ]) +
+          kern[15+3] * (d[-w3-3]  + d[-w3+3]  + d[ w3-3]  + d[ w3+3]) +
+          kern[10+3] * (d[-w3-2]  + d[-w3+2]  + d[-w2-3]  + d[-w2+3] + d[w2-3] + d[w2+3] + d[w3-2] + d[w3+2]) +
+          kern[ 5+3] * (d[-w3-1]  + d[-w3+1]  + d[-w1-3]  + d[-w1+3] + d[w1-3] + d[w1+3] + d[w3-1] + d[w3+1]) +
+          kern[   3] * (d[-w3  ]  + d[   -3]  + d[    3]  + d[ w3  ]) +
+          kern[10+2] * (d[-w2-2]  + d[-w2+2]  + d[ w2-2]  + d[ w2+2]) +
+          kern[ 5+2] * (d[-w2-1]  + d[-w2+1]  + d[-w1-2]  + d[-w1+2] + d[w1-2] + d[w1+2] + d[w2-1] + d[w2+1]) +
+          kern[   2] * (d[-w2  ]  + d[   -2]  + d[    2]  + d[ w2  ]) +
+          kern[ 5+1] * (d[-w1-1]  + d[-w1+1]  + d[ w1-1]  + d[ w1+1]) +
+          kern[   1] * (d[-w1  ]  + d[   -1]  + d[    1]  + d[ w1  ]) +
+          kern[   0] * (d[0]);
+  }
+  else // pixels closer than 4 to an edge: generic loop, taps outside the image are skipped
+  { // NOTE(review): this loop also adds offsets (3,4), (4,3), (4,4) which the unrolled sum omits - presumably zero weights, confirm
+    for(int ir = -4; ir <= 4; ir++)
+    {
+      const int irow = row+ir;
+      if(irow >= 0 && irow < height)
+      {
+        for(int ic = -4; ic <= 4; ic++)
+        {
+          const int icol = col+ic;
+          if(icol >=0 && icol < w1)
+            val += kern[5 * abs(ir) + abs(ic)] * in[mad24(irow, w1, icol)];
+        }
+      }
+    }
+  }
+  out[i] = divbuff[i] / fmax(val, 0.00001f); // denominator is floored to avoid division by zero
+}
+/* Set every element of the width x height mask buffer to 1.0f. */
+__kernel void prefill_clip_mask(global float *mask,
+                                const int width,
+                                const int height)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  const bool inside = x < width && y < height;
+
+  if(inside)
+    mask[mad24(y, width, x)] = 1.0f;
+}
+/* Stores the luminance of dev_out in Yold and zeroes `mask` around clipped or very dark pixels and on the 2-pixel image border. */
+__kernel void prepare_blend(__read_only image2d_t cfa,
+                            __read_only image2d_t dev_out,
+                            const int filters,
+                            global const unsigned char (*const xtrans)[6],
+                            global float *mask,      // presumably prefilled with 1.0f by prefill_clip_mask - confirm host call order
+                            global float *Yold,
+                            global float *whites,    // per-color threshold, indexed by the CFA color of the pixel
+                            const int w,
+                            const int height)
+{
+  const int col = get_global_id(0);
+  const int row = get_global_id(1);
+  if(col >= w || row >= height) return;
+
+  const float4 rgb = read_imagef(dev_out, samplerA, (int2)(col, row));
+  const float Y = fmax(0.0f, 0.2626f * rgb.x + 0.7152f * rgb.y + 0.0722f * rgb.z); // NOTE(review): weights sum to 1.05; 0.2626 looks like a typo for 0.2126 - confirm and keep in sync with the CPU path
+  const int k = mad24(row, w, col);
+  Yold[k] = Y;
+
+  if(row > 1 && col > 1 && row < height-2 && col < w -2)
+  {
+    const int w2 = 2 * w;
+    const int color = (filters == 9u) ? FCxtrans(row, col, xtrans) : FC(row, col, filters);
+    const float val = read_imagef(cfa, samplerA, (int2)(col, row)).x;
+    if(val > whites[color] || Y < CAPTURE_YMIN)
+    { // clear the 21-pixel neighbourhood (5x5 without its corners); neighbouring work-items may store the same 0.0f
+      mask[k-w2-1] = mask[k-w2]  = mask[k-w2+1] =
+      mask[k-w-2]  = mask[k-w-1] = mask[k-w ]   = mask[k-w+1] = mask[k-w+2] =
+      mask[k-2]    = mask[k-1]   = mask[k]      = mask[k+1]   = mask[k+2] =
+      mask[k+w-2]  = mask[k+w-1] = mask[k+w]    = mask[k+w+1] = mask[k+w+2] =
+      mask[k+w2-1] = mask[k+w2]  = mask[k+w2+1] = 0.0f;
+    }
+  }
+  else // the outermost two rows and columns are always masked out
+    mask[k] = 0.0f;
+}
+/* Scales blend[] by a local-contrast weight (std-dev of Yold over a 21-pixel window vs. `threshold`) and copies Yold to luminance. */
+__kernel void modify_blend(global float *blend,
+                           global float *Yold,
+                           global float *luminance,
+                           const float threshold,
+                           const int width,
+                           const int height)
+{
+  const int icol = get_global_id(0);
+  const int irow = get_global_id(1);
+  if(icol >= width || irow >= height) return;
+  // the window centre is clamped so the 5x5 window stays inside an image of at least 5x5 pixels
+  const int row = clamp(irow, 2, height-3);
+  const int col = clamp(icol, 2, width-3);
+  // mean over 21 samples: rows row-1..row+1 are 5 wide, rows row-2 and row+2 are 3 wide
+  float av = 0.0f;
+  for(int y = row-1; y < row+2; y++)
+  {
+    for(int x = col-2; x < col+3; x++)
+      av += Yold[mad24(y, width, x)];
+  }
+  for(int x = col-1; x < col+2; x++)
+  {
+    av += Yold[mad24(row-2, width, x)];
+    av += Yold[mad24(row+2, width, x)];
+  }
+  av /= 21.0f;
+  // squared deviations must cover exactly the 21 samples of the mean, as the sum is divided by 21 below
+  float sv = 0.0f;
+  for(int y = row-1; y < row+2; y++)
+  {
+    for(int x = col-2; x < col+3; x++)
+      sv += sqrf(Yold[mad24(y, width, x)] - av);
+  }
+  for(int x = col-1; x < col+2; x++) // 3 wide like the mean; a 5-wide pass would add the 4 corners (25 terms)
+  {
+    sv+= sqrf(Yold[mad24(row-2, width, x)] - av);
+    sv+= sqrf(Yold[mad24(row+2, width, x)] - av);
+  }
+  sv = dtcl_pow(fmax(0.0f, 5.0f * dtcl_sqrt(sv / 21.f) - threshold), CAPTURE_THRESHPOWER);
+  const int k = mad24(irow, width, icol); // results are written at the unclamped position
+
+  blend[k] *= clamp(sv, 0.0f, 1.0f);
+  luminance[k] = Yold[k];
+}
+/* Copies `in` to `out` with the blend mask value stored in the w channel. */
+__kernel void show_blend_mask(__read_only image2d_t in,
+                              __write_only image2d_t out,
+                              global float *blend_mask,
+                              const int width,
+                              const int height)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  if(x >= width || y >= height) return;
+
+  const float weight = blend_mask[mad24(y, width, x)];
+  float4 rgba = read_imagef(in, samplerA, (int2)(x, y));
+  rgba.w = weight < CAPTURE_BLEND_EPS ? 0.0f : weight; // values under the epsilon are written as 0
+  write_imagef(out, (int2)(x, y), rgba);
+}
+/* Scales each pixel of `in` by new/old luminance, the new value being a blendmask-weighted mix of luminance[] and tmp[]. */
+__kernel void capture_result( __read_only image2d_t in,
+                              __write_only image2d_t out,
+                              global float *blendmask,
+                              global float *luminance,
+                              global float *tmp,
+                              const int width,
+                              const int height)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  if(x >= width || y >= height) return;
+
+  const int idx = mad24(y, width, x);
+  const float weight = blendmask[idx];
+  float4 rgba = read_imagef(in, samplerA, (int2)(x, y));
+
+  // pixels at or below the blend epsilon pass through unchanged
+  if(weight > CAPTURE_BLEND_EPS)
+  {
+    const float before = fmax(luminance[idx], 0.000001f);
+    const float after = mix(before, fmax(tmp[idx], 0.0000001f), clamp(weight, 0.0f, 1.0f));
+    // the scalar ratio is applied to all four channels
+    rgba = rgba * (float4)(after / before);
+  }
+  write_imagef(out, (int2)(x, y), rgba);
+}
+
+#undef CAPTURE_KERNEL_ALIGN
diff --git a/data/kernels/programs.conf b/data/kernels/programs.conf
index d2f38ad82157..dc1dc287ba46 100644
--- a/data/kernels/programs.conf
+++ b/data/kernels/programs.conf
@@ -38,3 +38,4 @@ blurs.cl                34
 bspline.cl              35
 sigmoid.cl              36
 colorequal.cl           37
+capture.cl              38
diff --git a/src/iop/demosaic.c b/src/iop/demosaic.c
index 96d50748c391..80bb00c2e4a6 100644
--- a/src/iop/demosaic.c
+++ b/src/iop/demosaic.c
@@ -46,7 +46,7 @@
 #include <string.h>
 #include <time.h>
 
-DT_MODULE_INTROSPECTION(4, dt_iop_demosaic_params_t)
+DT_MODULE_INTROSPECTION(5, dt_iop_demosaic_params_t)
 
 #define DT_DEMOSAIC_XTRANS 1024 // masks for non-Bayer demosaic ops
 #define DT_DEMOSAIC_DUAL 2048   // masks for dual demosaicing methods
@@ -124,7 +124,12 @@ typedef struct dt_iop_demosaic_params_t
   dt_iop_demosaic_smooth_t color_smoothing;     // $DEFAULT: DT_DEMOSAIC_SMOOTH_OFF $DESCRIPTION: "color smoothing"
   dt_iop_demosaic_method_t demosaicing_method;  // $DEFAULT: DT_IOP_DEMOSAIC_RCD $DESCRIPTION: "method"
   dt_iop_demosaic_lmmse_t lmmse_refine;         // $DEFAULT: DT_LMMSE_REFINE_1 $DESCRIPTION: "LMMSE refine"
-  float dual_thrs;                              // $MIN: 0.0 $MAX: 1.0 $DEFAULT: 0.20 $DESCRIPTION: "dual threshold"
+  float dual_thrs;                              // $MIN: 0.0 $MAX: 1.0 $DEFAULT: 0.2 $DESCRIPTION: "dual threshold"
+  float cs_radius;                              // $MIN: 0.0 $MAX: 2.0 $DEFAULT: 0.0 $DESCRIPTION: "radius"
+  float cs_thrs;                                // $MIN: 0.0 $MAX: 1.0 $DEFAULT: 0.35 $DESCRIPTION: "contrast threshold"
+  float cs_boost;                               // $MIN: 0.0 $MAX: 1.5 $DEFAULT: 0.0 $DESCRIPTION: "corner boost"
+  int cs_strength;                              // $MIN: 0 $MAX: 20 $DEFAULT: 0 $DESCRIPTION: "capture sharpen"
+  float reserved;
 } dt_iop_demosaic_params_t;
 
 typedef struct dt_iop_demosaic_gui_data_t
@@ -137,7 +142,13 @@ typedef struct dt_iop_demosaic_gui_data_t
   GtkWidget *demosaic_method_bayerfour;
   GtkWidget *dual_thrs;
   GtkWidget *lmmse_refine;
+  GtkWidget *cs_thrs;
+  GtkWidget *cs_radius;
+  GtkWidget *cs_boost;
+  GtkWidget *cs_strength;
+  gboolean cs_mask;
   gboolean dual_mask;
+  gboolean autoradius;
 } dt_iop_demosaic_gui_data_t;
 
 typedef struct dt_iop_demosaic_global_data_t
@@ -193,6 +204,14 @@ typedef struct dt_iop_demosaic_global_data_t
   int kernel_rcd_border_redblue;
   int kernel_rcd_border_green;
   int kernel_write_blended_dual;
+  int gaussian_9x9_mul;
+  int gaussian_9x9_div;
+  int prefill_clip_mask;
+  int prepare_blend;
+  int modify_blend;
+  int show_blend_mask;
+  int capture_result;
+  float *gauss_coeffs;
 } dt_iop_demosaic_global_data_t;
 
 typedef struct dt_iop_demosaic_data_t
@@ -204,6 +223,10 @@ typedef struct dt_iop_demosaic_data_t
   float median_thrs;
   double CAM_to_RGB[3][4];
   float dual_thrs;
+  float cs_radius;
+  float cs_thrs;
+  float cs_boost;
+  int cs_strength;
 } dt_iop_demosaic_data_t;
 
 static gboolean _get_thumb_quality(const int width, const int height)
@@ -281,6 +304,7 @@ void amaze_demosaic(dt_dev_pixelpipe_iop_t *piece,
 #include "iop/demosaicing/ppg.c"
 #include "iop/demosaicing/rcd.c"
 #include "iop/demosaicing/lmmse.c"
+#include "iop/demosaicing/capture.c"
 #include "iop/demosaicing/dual.c"
 
 const char *name()
@@ -321,7 +345,7 @@ int legacy_params(dt_iop_module_t *self,
                   int32_t *new_params_size,
                   int *new_version)
 {
-  typedef struct dt_iop_demosaic_params_v4_t
+  typedef struct dt_iop_demosaic_params_v5_t
   {
     dt_iop_demosaic_greeneq_t green_eq;
     float median_thrs;
@@ -329,7 +353,12 @@ int legacy_params(dt_iop_module_t *self,
     dt_iop_demosaic_method_t demosaicing_method;
     dt_iop_demosaic_lmmse_t lmmse_refine;
     float dual_thrs;
-  } dt_iop_demosaic_params_v4_t;
+    float cs_radius;
+    float cs_thrs;
+    float cs_boost;
+    int cs_strength;
+    float reserved;
+  } dt_iop_demosaic_params_v5_t;
 
   if(old_version == 2)
   {
@@ -340,17 +369,22 @@ int legacy_params(dt_iop_module_t *self,
     } dt_iop_demosaic_params_v2_t;
 
     const dt_iop_demosaic_params_v2_t *o = (dt_iop_demosaic_params_v2_t *)old_params;
-    dt_iop_demosaic_params_v4_t *n = malloc(sizeof(dt_iop_demosaic_params_v4_t));
+    dt_iop_demosaic_params_v5_t *n = malloc(sizeof(dt_iop_demosaic_params_v5_t));
     n->green_eq = o->green_eq;
     n->median_thrs = o->median_thrs;
     n->color_smoothing = DT_DEMOSAIC_SMOOTH_OFF;
     n->demosaicing_method = DT_IOP_DEMOSAIC_PPG;
     n->lmmse_refine = DT_LMMSE_REFINE_1;
     n->dual_thrs = 0.20f;
+    n->cs_radius = 0.0f;
+    n->cs_thrs = 0.35f;
+    n->cs_boost = 0.0f;
+    n->cs_strength = 0;
+    n->reserved = 0.0f;
 
     *new_params = n;
-    *new_params_size = sizeof(dt_iop_demosaic_params_v4_t);
-    *new_version = 4;
+    *new_params_size = sizeof(dt_iop_demosaic_params_v5_t);
+    *new_version = 5;
     return 0;
   }
 
@@ -366,13 +400,45 @@ int legacy_params(dt_iop_module_t *self,
     } dt_iop_demosaic_params_v3_t;
 
     const dt_iop_demosaic_params_v3_t *o = (dt_iop_demosaic_params_v3_t *)old_params;
-    dt_iop_demosaic_params_v4_t *n = malloc(sizeof(dt_iop_demosaic_params_v4_t));
+    dt_iop_demosaic_params_v5_t *n = malloc(sizeof(dt_iop_demosaic_params_v5_t));
     memcpy(n, o, sizeof *o);
     n->dual_thrs = 0.20f;
+    n->cs_radius = 0.0f;
+    n->cs_thrs = 0.35f;
+    n->cs_boost = 0.0f;
+    n->cs_strength = 0;
+    n->reserved = 0.0f;
 
     *new_params = n;
-    *new_params_size = sizeof(dt_iop_demosaic_params_v4_t);
-    *new_version = 4;
+    *new_params_size = sizeof(dt_iop_demosaic_params_v5_t);
+    *new_version = 5;
+    return 0;
+  }
+
+  if(old_version == 4)
+  {
+    typedef struct dt_iop_demosaic_params_v4_t
+    {
+      dt_iop_demosaic_greeneq_t green_eq;
+      float median_thrs;
+      dt_iop_demosaic_smooth_t color_smoothing;
+      dt_iop_demosaic_method_t demosaicing_method;
+      dt_iop_demosaic_lmmse_t lmmse_refine;
+      float dual_thrs;
+    } dt_iop_demosaic_params_v4_t;
+
+    const dt_iop_demosaic_params_v4_t *o = (dt_iop_demosaic_params_v4_t *)old_params;
+    dt_iop_demosaic_params_v5_t *n = malloc(sizeof(dt_iop_demosaic_params_v5_t));
+    memcpy(n, o, sizeof *o);
+    n->cs_radius = 0.0f;
+    n->cs_thrs = 0.35f;
+    n->cs_boost = 0.0f;
+    n->cs_strength = 0;
+    n->reserved = 0.0f;
+
+    *new_params = n;
+    *new_params_size = sizeof(dt_iop_demosaic_params_v5_t);
+    *new_version = 5;
     return 0;
   }
 
@@ -577,11 +643,12 @@ void tiling_callback(dt_iop_module_t *self,
     tiling->overlap = 6;
   }
 
-  if(d->demosaicing_method & DT_DEMOSAIC_DUAL)
+  if((d->demosaicing_method & DT_DEMOSAIC_DUAL) || d->cs_strength)
   {
-    // make sure VNG4 is also possible
-    tiling->factor += 1.0f;
-    tiling->overlap = MAX(6, tiling->overlap);
+    // internals plus 2 output
+    tiling->factor = MAX(tiling->factor, 1.0f + 2.0f * ioratio);
+    // works for bayer and xtrans
+    tiling->overlap = MAX(d->cs_strength ? 18 : 6, tiling->overlap);
   }
   return;
 }
@@ -601,7 +668,6 @@ void process(dt_iop_module_t *self,
   const gboolean run_fast = pipe->type & DT_DEV_PIXELPIPE_FAST;
   const gboolean fullpipe = pipe->type & DT_DEV_PIXELPIPE_FULL;
   const gboolean previewpipe = pipe->type & DT_DEV_PIXELPIPE_PREVIEW;
-
   const uint8_t(*const xtrans)[6] = (const uint8_t(*const)[6])pipe->dsc.xtrans;
 
   const dt_iop_demosaic_data_t *d = piece->data;
@@ -622,17 +688,27 @@ void process(dt_iop_module_t *self,
           && demosaicing_method != DT_IOP_DEMOSAIC_PASSTHROUGH_COLOR))
     demosaicing_method = is_xtrans ? DT_IOP_DEMOSAIC_VNG : DT_IOP_DEMOSAIC_VNG4;
 
-  gboolean showmask = FALSE;
+  gboolean show_dualmask = FALSE;
+  gboolean show_capturemask = FALSE;
+  gboolean vng_linear = FALSE;
   if(self->dev->gui_attached && fullpipe)
   {
     if(g->dual_mask)
     {
-      showmask = TRUE;
+      show_dualmask = TRUE;
+      pipe->mask_display = DT_DEV_PIXELPIPE_DISPLAY_MASK;
+    }
+    if(g->cs_mask)
+    {
+      show_capturemask = TRUE;
       pipe->mask_display = DT_DEV_PIXELPIPE_DISPLAY_MASK;
     }
     // take care of passthru modes
     if(pipe->mask_display == DT_DEV_PIXELPIPE_DISPLAY_PASSTHRU)
+    {
       demosaicing_method = is_xtrans ? DT_IOP_DEMOSAIC_VNG : DT_IOP_DEMOSAIC_VNG4;
+      vng_linear = TRUE;
+    }
   }
 
   float *in  = (float *)i;
@@ -656,6 +732,16 @@ void process(dt_iop_module_t *self,
 
   const gboolean direct = roi_out->width == width && roi_out->height == height && feqf(roi_in->scale, roi_out->scale, 1e-8f);
 
+  const gboolean passthru = demosaicing_method == DT_IOP_DEMOSAIC_PASSTHROUGH_MONOCHROME
+                         || demosaicing_method == DT_IOP_DEMOSAIC_PASSTHROUGH_COLOR;
+
+  const gboolean do_capture = !passthru
+                          &&  !is_4bayer
+                          &&  !show_dualmask
+                          &&  !run_fast
+                          &&  !previewpipe
+                          &&  d->cs_strength;
+
   if(!direct)
     out = dt_alloc_align_float((size_t)4 * width * height);
 
@@ -696,13 +782,13 @@ void process(dt_iop_module_t *self,
     else if(base_demosaicing_method == DT_IOP_DEMOSAIC_MARKESTEIJN || base_demosaicing_method == DT_IOP_DEMOSAIC_MARKESTEIJN_3)
       xtrans_markesteijn_interpolate(out, in, roi_in, xtrans, passes);
     else
-      vng_interpolate(out, in, roi_in, pipe->dsc.filters, xtrans, qual_flags & DT_DEMOSAIC_ONLY_VNG_LINEAR);
+      vng_interpolate(out, in, roi_in, pipe->dsc.filters, xtrans, vng_linear);
   }
   else
   {
     if(demosaicing_method == DT_IOP_DEMOSAIC_VNG4 || is_4bayer)
     {
-      vng_interpolate(out, in, roi_in, pipe->dsc.filters, xtrans, qual_flags & DT_DEMOSAIC_ONLY_VNG_LINEAR);
+      vng_interpolate(out, in, roi_in, pipe->dsc.filters, xtrans, vng_linear);
       if(is_4bayer)
       {
         dt_colorspaces_cygm_to_rgb(out, width * height, d->CAM_to_RGB);
@@ -722,8 +808,11 @@ void process(dt_iop_module_t *self,
   if(pipe->want_detail_mask)
     dt_dev_write_scharr_mask(piece, out, roi_in, TRUE);
 
-  if(dual)
-    dual_demosaic(piece, out, in, roi_in, pipe->dsc.filters, xtrans, showmask, d->dual_thrs);
+  if(do_capture)
+    _capture_sharpen(self, piece, in, out, roi_in, show_capturemask);
+
+  if(dual && !show_capturemask)
+    dual_demosaic(piece, out, in, roi_in, pipe->dsc.filters, xtrans, show_dualmask, d->dual_thrs);
 
   if((float *)i != in) dt_free_align(in);
 
@@ -755,6 +844,7 @@ int process_cl(dt_iop_module_t *self,
   const gboolean run_fast = pipe->type & DT_DEV_PIXELPIPE_FAST;
   const gboolean fullpipe = pipe->type & DT_DEV_PIXELPIPE_FULL;
   const gboolean previewpipe = pipe->type & DT_DEV_PIXELPIPE_PREVIEW;
+
   const int qual_flags = demosaic_qual_flags(piece, img, roi_out);
   const gboolean fullscale = qual_flags & DT_DEMOSAIC_FULL_SCALE;
   const gboolean is_xtrans = pipe->dsc.filters == 9u;
@@ -782,17 +872,27 @@ int process_cl(dt_iop_module_t *self,
       && demosaicing_method != DT_IOP_DEMOSAIC_PASSTHROUGH_COLOR))
     demosaicing_method = is_xtrans ? DT_IOP_DEMOSAIC_VNG : DT_IOP_DEMOSAIC_VNG4;
 
-  gboolean showmask = FALSE;
+  gboolean show_dualmask = FALSE;
+  gboolean show_capturemask = FALSE;
+  gboolean vng_linear = FALSE;
   if(self->dev->gui_attached && fullpipe)
   {
     if(g->dual_mask)
     {
-      showmask = TRUE;
+      show_dualmask = TRUE;
+      pipe->mask_display = DT_DEV_PIXELPIPE_DISPLAY_MASK;
+    }
+    if(g->cs_mask)
+    {
+      show_capturemask = TRUE;
       pipe->mask_display = DT_DEV_PIXELPIPE_DISPLAY_MASK;
     }
     // take care of passthru modes
     if(pipe->mask_display == DT_DEV_PIXELPIPE_DISPLAY_PASSTHRU)
+    {
       demosaicing_method = is_xtrans ? DT_IOP_DEMOSAIC_VNG : DT_IOP_DEMOSAIC_VNG4;
+      vng_linear = TRUE;
+    }
   }
 
   const int devid = pipe->devid;
@@ -829,6 +929,14 @@ int process_cl(dt_iop_module_t *self,
   const gboolean direct = roi_out->width == width && roi_out->height == height && feqf(roi_in->scale, roi_out->scale, 1e-8f);
   const int base_demosaicing_method = demosaicing_method & ~DT_DEMOSAIC_DUAL;
   const gboolean dual = (demosaicing_method & DT_DEMOSAIC_DUAL) && !run_fast && !previewpipe;
+  const gboolean passthru = demosaicing_method == DT_IOP_DEMOSAIC_PASSTHROUGH_MONOCHROME
+                         || demosaicing_method == DT_IOP_DEMOSAIC_PASSTHROUGH_COLOR;
+
+  const gboolean do_capture = !passthru
+                          &&  !run_fast
+                          &&  !show_dualmask
+                          &&  !previewpipe
+                          &&  d->cs_strength;
 
   cl_mem out_image = direct ? dev_out : dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
   cl_mem in_image = dev_in;
@@ -845,9 +953,7 @@ int process_cl(dt_iop_module_t *self,
     if(err != CL_SUCCESS) goto finish;
   }
 
-  if(demosaicing_method == DT_IOP_DEMOSAIC_PASSTHROUGH_MONOCHROME ||
-     demosaicing_method == DT_IOP_DEMOSAIC_PPG ||
-     demosaicing_method == DT_IOP_DEMOSAIC_PASSTHROUGH_COLOR)
+  if(passthru || demosaicing_method == DT_IOP_DEMOSAIC_PPG)
   {
     err = process_default_cl(self, piece, in_image, out_image, roi_in, demosaicing_method);
     if(err != CL_SUCCESS) return err;
@@ -859,7 +965,7 @@ int process_cl(dt_iop_module_t *self,
   }
   else if(demosaicing_method == DT_IOP_DEMOSAIC_VNG4 || demosaicing_method == DT_IOP_DEMOSAIC_VNG)
   {
-    err = process_vng_cl(self, piece, in_image, out_image, roi_in, qual_flags & DT_DEMOSAIC_ONLY_VNG_LINEAR);
+    err = process_vng_cl(self, piece, in_image, out_image, roi_in, vng_linear);
     if(err != CL_SUCCESS) goto finish;
   }
   else if(base_demosaicing_method == DT_IOP_DEMOSAIC_MARKESTEIJN || base_demosaicing_method == DT_IOP_DEMOSAIC_MARKESTEIJN_3)
@@ -880,7 +986,13 @@ int process_cl(dt_iop_module_t *self,
     if(err != CL_SUCCESS) goto finish;
   }
 
-  if(dual)
+  if(do_capture)
+  {
+    err = _capture_sharpen_cl(self, piece, dev_in, out_image, roi_in, show_capturemask);
+    if(err != CL_SUCCESS) goto finish;
+  }
+
+  if(dual && !show_capturemask)
   {
     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
     cl_mem low_image = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
@@ -895,7 +1007,7 @@ int process_cl(dt_iop_module_t *self,
       if(err == CL_SUCCESS)
         err = color_smoothing_cl(self, piece, low_image, low_image, roi_in, DT_DEMOSAIC_SMOOTH_2);
       if(err == CL_SUCCESS)
-        err = dual_demosaic_cl(self, piece, cp_image, low_image, out_image, roi_in, showmask);
+        err = dual_demosaic_cl(self, piece, cp_image, low_image, out_image, roi_in, show_dualmask);
       dt_opencl_release_mem_object(cp_image);
       dt_opencl_release_mem_object(low_image);
     }
@@ -991,6 +1103,19 @@ void init_global(dt_iop_module_so_t *self)
   gd->kernel_rcd_border_redblue = dt_opencl_create_kernel(rcd, "rcd_border_redblue");
   gd->kernel_rcd_border_green = dt_opencl_create_kernel(rcd, "rcd_border_green");
   gd->kernel_write_blended_dual  = dt_opencl_create_kernel(rcd, "write_blended_dual");
+
+  const int capt = 38; // capture.cl, from programs.conf
+  gd->gaussian_9x9_mul = dt_opencl_create_kernel(capt, "kernel_9x9_mul");
+  gd->gaussian_9x9_div = dt_opencl_create_kernel(capt, "kernel_9x9_div");
+  gd->prefill_clip_mask = dt_opencl_create_kernel(capt, "prefill_clip_mask");
+  gd->prepare_blend = dt_opencl_create_kernel(capt, "prepare_blend");
+  gd->modify_blend = dt_opencl_create_kernel(capt, "modify_blend");
+  gd->show_blend_mask = dt_opencl_create_kernel(capt, "show_blend_mask");
+  gd->capture_result = dt_opencl_create_kernel(capt, "capture_result");
+
+  gd->gauss_coeffs = dt_alloc_align_float(CAPTURE_KERNEL_ALIGN * (UCHAR_MAX+1));
+  for(int i = 0; i <= UCHAR_MAX; i++)
+    _calc_9x9_gauss_coeffs(&gd->gauss_coeffs[i * CAPTURE_KERNEL_ALIGN], MAX(1e-7f, (float)i * CAPTURE_GAUSS_FRACTION));
 }
 
 void cleanup_global(dt_iop_module_so_t *self)
@@ -1046,6 +1171,14 @@ void cleanup_global(dt_iop_module_so_t *self)
   dt_opencl_free_kernel(gd->kernel_rcd_border_redblue);
   dt_opencl_free_kernel(gd->kernel_rcd_border_green);
   dt_opencl_free_kernel(gd->kernel_write_blended_dual);
+  dt_opencl_free_kernel(gd->gaussian_9x9_mul);
+  dt_opencl_free_kernel(gd->gaussian_9x9_div);
+  dt_opencl_free_kernel(gd->prefill_clip_mask);
+  dt_opencl_free_kernel(gd->prepare_blend);
+  dt_opencl_free_kernel(gd->modify_blend);
+  dt_opencl_free_kernel(gd->show_blend_mask);
+  dt_opencl_free_kernel(gd->capture_result);
+  dt_free_align(gd->gauss_coeffs);
   free(self->data);
   self->data = NULL;
   _cleanup_lmmse_gamma();
@@ -1067,7 +1200,10 @@ void commit_params(dt_iop_module_t *self,
   d->dual_thrs = p->dual_thrs;
   d->lmmse_refine = p->lmmse_refine;
   dt_iop_demosaic_method_t use_method = p->demosaicing_method;
-
+  d->cs_radius = p->cs_radius;
+  d->cs_thrs = p->cs_thrs;
+  d->cs_boost = p->cs_boost;
+  d->cs_strength = p->cs_strength;
   const gboolean xmethod = use_method & DT_DEMOSAIC_XTRANS;
   const gboolean bayer4  = self->dev->image_storage.flags & DT_IMAGE_4BAYER;
   const gboolean bayer   = self->dev->image_storage.buf_dsc.filters != 9u && !bayer4;
@@ -1154,7 +1290,6 @@ void commit_params(dt_iop_module_t *self,
       piece->process_cl_ready = FALSE;
   }
 
-
   // green-equilibrate over full image excludes tiling
   // The details mask calculation required for dual demosaicing does not allow tiling.
   if(    d->green_eq == DT_IOP_GREEN_EQ_FULL
@@ -1244,10 +1379,18 @@ void gui_changed(dt_iop_module_t *self, GtkWidget *w, void *previous)
     || use_method == DT_IOP_DEMOSAIC_PASSTHR_MONOX
     || use_method == DT_IOP_DEMOSAIC_PASSTHR_COLORX;
 
+  const gboolean capture_support = !passing && !bayer4;
+  const gboolean do_capture = capture_support && p->cs_strength;
+
   gtk_widget_set_visible(g->demosaic_method_bayer, bayer);
   gtk_widget_set_visible(g->demosaic_method_bayerfour, bayer4);
   gtk_widget_set_visible(g->demosaic_method_xtrans, xtrans);
 
+  gtk_widget_set_sensitive(g->cs_radius, do_capture);
+  gtk_widget_set_sensitive(g->cs_thrs, do_capture);
+  gtk_widget_set_sensitive(g->cs_boost, do_capture);
+  gtk_widget_set_sensitive(g->cs_strength, capture_support);
+
   // we might have a wrong method dur to xtrans/bayer - mode mismatch
   if(bayer)
     dt_bauhaus_combobox_set_from_value(g->demosaic_method_bayer, use_method);
@@ -1286,6 +1429,11 @@ void gui_changed(dt_iop_module_t *self, GtkWidget *w, void *previous)
     dt_bauhaus_widget_set_quad_active(g->dual_thrs, FALSE);
     g->dual_mask = FALSE;
   }
+  if(!w || w != g->cs_thrs)
+  {
+    dt_bauhaus_widget_set_quad_active(g->cs_thrs, FALSE);
+    g->cs_mask = FALSE;
+  }
 
   // as the dual modes change behaviour for previous pipeline modules we do a reprocess
   if(isdual && (w == g->demosaic_method_bayer || w == g->demosaic_method_xtrans))
@@ -1296,26 +1444,64 @@ void gui_update(dt_iop_module_t *self)
 {
   gui_changed(self, NULL, NULL);
   gtk_stack_set_visible_child_name(GTK_STACK(self->widget), self->default_enabled ? "raw" : "non_raw");
+  dt_iop_demosaic_gui_data_t *g = self->gui_data;
+  g->autoradius = FALSE;
 }
 
-static void _visualize_callback(GtkWidget *quad, dt_iop_module_t *self)
+static void _dual_quad_callback(GtkWidget *quad, dt_iop_module_t *self) // quad toggle of the dual_thrs slider
 {
   if(darktable.gui->reset) return;
   dt_iop_demosaic_gui_data_t *g = self->gui_data;
 
   g->dual_mask = dt_bauhaus_widget_get_quad_active(quad);
+
+  dt_bauhaus_widget_set_quad_active(g->cs_thrs, FALSE); // switch the capture sharpen mask view off
+  g->cs_mask = FALSE;
+  dt_dev_reprocess_center(self->dev);
+}
+
+static void _cs_quad_callback(GtkWidget *quad, dt_iop_module_t *self) // quad toggle of the cs_thrs slider
+{
+  if(darktable.gui->reset) return;
+  dt_iop_demosaic_gui_data_t *g = self->gui_data;
+  g->cs_mask = dt_bauhaus_widget_get_quad_active(quad); // remember whether the capture sharpen mask is requested
+
+  dt_bauhaus_widget_set_quad_active(g->dual_thrs, FALSE); // switch the dual demosaic mask view off
+  g->dual_mask = FALSE;
   dt_dev_reprocess_center(self->dev);
 }
 
+static void _cs_autoradius_callback(GtkWidget *quad, dt_iop_module_t *self) // quad button of the cs_radius slider
+{
+  if(darktable.gui->reset) return;
+  dt_iop_demosaic_gui_data_t *g = self->gui_data;
+  g->autoradius = TRUE;               // request flag, checked by _capture_sharpen() on the full pipe
+  dt_dev_reprocess_center(self->dev); // rerun the pipe so the request can be served
+}
+
+static void _check_autoradius(gpointer instance, dt_iop_module_t *self) // handler for DT_SIGNAL_DEVELOP_UI_PIPE_FINISHED
+{
+  dt_iop_demosaic_gui_data_t *g = self->gui_data;
+  if(g && g->autoradius) // raised by the quad button and by _capture_sharpen() after it wrote a new radius to the params
+  {
+    dt_iop_demosaic_params_t *p = self->params;
+    g->autoradius = FALSE;                                 // handle the request only once
+    dt_bauhaus_slider_set_val(g->cs_radius, p->cs_radius); // let the slider show the radius held in the params
+    dt_dev_add_history_item(darktable.develop, self, TRUE);
+  }
+}
+
 void gui_focus(dt_iop_module_t *self, gboolean in)
 {
   dt_iop_demosaic_gui_data_t *g = self->gui_data;
   if(!in)
   {
-    const gboolean was_dualmask = g->dual_mask;
+    const gboolean was_masking = g->dual_mask || g->cs_mask; // was any of the two mask views requested?
     dt_bauhaus_widget_set_quad_active(g->dual_thrs, FALSE);
     g->dual_mask = FALSE;
-    if(was_dualmask) dt_dev_reprocess_center(self->dev);
+    dt_bauhaus_widget_set_quad_active(g->cs_thrs, FALSE);
+    g->cs_mask = FALSE;
+    if(was_masking) dt_dev_reprocess_center(self->dev); // reprocess only if a mask view has been dropped
   }
 }
 
@@ -1342,18 +1528,17 @@ void gui_init(dt_iop_module_t *self)
   for(int i=0;i<4;i++) dt_bauhaus_combobox_remove_at(g->demosaic_method_bayerfour, 1);
   gtk_widget_set_tooltip_text(g->demosaic_method_bayerfour, _("Bayer4 sensor demosaicing methods."));
 
-  g->median_thrs = dt_bauhaus_slider_from_params(self, "median_thrs");
-  dt_bauhaus_slider_set_digits(g->median_thrs, 3);
-  gtk_widget_set_tooltip_text(g->median_thrs, _("threshold for edge-aware median.\nset to 0.0 to switch off\n"
-                                                "set to 1.0 to ignore edges"));
-
   g->dual_thrs = dt_bauhaus_slider_from_params(self, "dual_thrs");
   dt_bauhaus_slider_set_digits(g->dual_thrs, 2);
   gtk_widget_set_tooltip_text(g->dual_thrs, _("contrast threshold for dual demosaic.\nset to 0.0 for high frequency content\n"
                                                 "set to 1.0 for flat content"));
-  dt_bauhaus_widget_set_quad(g->dual_thrs, self, dtgtk_cairo_paint_showmask, TRUE, _visualize_callback,
+  dt_bauhaus_widget_set_quad(g->dual_thrs, self, dtgtk_cairo_paint_showmask, TRUE, _dual_quad_callback,
                              _("toggle mask visualization"));
 
+  g->median_thrs = dt_bauhaus_slider_from_params(self, "median_thrs");
+  dt_bauhaus_slider_set_digits(g->median_thrs, 3);
+  gtk_widget_set_tooltip_text(g->median_thrs, _("threshold for edge-aware median.\nset to 0.0 to switch off\n"
+                                                "set to 1.0 to ignore edges"));
   g->lmmse_refine = dt_bauhaus_combobox_from_params(self, "lmmse_refine");
   gtk_widget_set_tooltip_text(g->lmmse_refine, _("LMMSE refinement steps. the median steps average the output,\nrefine adds some recalculation of red & blue channels"));
 
@@ -1363,6 +1548,33 @@ void gui_init(dt_iop_module_t *self)
   g->greeneq = dt_bauhaus_combobox_from_params(self, "green_eq");
   gtk_widget_set_tooltip_text(g->greeneq, _("green channels matching method"));
 
+  g->cs_strength = dt_bauhaus_slider_from_params(self, "cs_strength");
+  gtk_widget_set_tooltip_text(g->cs_strength, _("enable capture sharpening and set effect strength based on iterations"));
+
+  g->cs_radius = dt_bauhaus_slider_from_params(self, "cs_radius");
+  dt_bauhaus_slider_set_digits(g->cs_radius, 2);
+  dt_bauhaus_slider_set_format(g->cs_radius, _(" px")); // translate once, not the already translated string again
+  gtk_widget_set_tooltip_text(g->cs_radius, _("capture sharpen radius should reflect the gaussian type blur by camera\n"
+                                              "sensor, possibly the anti-aliasing filter and the lens.\n"
+                                              "increasing this too far will lead to artifacts like halos\n"
+                                              "especially at sharp transitions"));
+  dt_bauhaus_slider_set_hard_min(g->cs_radius, 0.01f);
+  dt_bauhaus_widget_set_quad(g->cs_radius, self, dtgtk_cairo_paint_reset, FALSE, _cs_autoradius_callback,
+    _("calculate the capture sharpen radius from sensor data"));
+  g->autoradius = FALSE; // no radius calculation pending at startup
+
+  g->cs_thrs = dt_bauhaus_slider_from_params(self, "cs_thrs");
+  dt_bauhaus_slider_set_format(g->cs_thrs, "%");
+  dt_bauhaus_slider_set_digits(g->cs_thrs, 0);
+  gtk_widget_set_tooltip_text(g->cs_thrs, _("adjust the threshold to restrict capture sharpening to parts with high contrast"));
+  dt_bauhaus_widget_set_quad(g->cs_thrs, self, dtgtk_cairo_paint_showmask, TRUE, _cs_quad_callback, _("visualize the restricting mask"));
+
+  g->cs_boost = dt_bauhaus_slider_from_params(self, "cs_boost");
+  dt_bauhaus_slider_set_digits(g->cs_boost, 2);
+  dt_bauhaus_slider_set_format(g->cs_boost, _(" px")); // translate once, not the already translated string again
+  gtk_widget_set_tooltip_text(g->cs_boost, _("further increase radius at image corners.\n"
+                                             "the centre half of the image will not be affected"));
+
   // start building top level widget
   self->widget = gtk_stack_new();
   gtk_stack_set_homogeneous(GTK_STACK(self->widget), FALSE);
@@ -1372,6 +1584,7 @@ void gui_init(dt_iop_module_t *self)
 
   gtk_stack_add_named(GTK_STACK(self->widget), label_non_raw, "non_raw");
   gtk_stack_add_named(GTK_STACK(self->widget), box_raw, "raw");
+  DT_CONTROL_SIGNAL_HANDLE(DT_SIGNAL_DEVELOP_UI_PIPE_FINISHED, _check_autoradius);
 }
 
 // clang-format off
diff --git a/src/iop/demosaicing/capture.c b/src/iop/demosaicing/capture.c
new file mode 100644
index 000000000000..5d7d4fedd5d8
--- /dev/null
+++ b/src/iop/demosaicing/capture.c
@@ -0,0 +1,836 @@
+/*
+    This file is part of darktable,
+    Copyright (C) 2025 darktable developers.
+
+    darktable is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    darktable is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with darktable.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* remarks:
+    Credits go to: Ingo Weyrich (heckflosse67@gmx.de), he implemented the original algorithm for rawtherapee.
+
+   1) - The gaussian convolution filters take the coeffs from precalculated data in gd->gauss_coeffs,
+        there is one kernel per index 0..UCHAR_MAX with a sigma step of CAPTURE_GAUSS_FRACTION.
+      - The chosen kernel is selected per pixel via an index map, this is derived from cs_radius, cs_boost
+        and distance from image centre.
+      - using the index map improves performance and allows runtime modification of the used per pixel
+        gaussian sigma.
+      - Note: this is similar to the per-tile sigma in the RT implementation.
+   2) It's currently not planned to increase the maximum sigma so we can stay with the 9x9 kernels.
+   3) Reminders and possibly left to do:
+      - halo suppression at very strong gradients?
+      - automatic noise detection or reduction?
+      - can we auto-stop? per pixel?
+      - Internal CPU code tiling for performance? tile size would be the same as for rcd
+   4) Notes
+      - If the demosaicer downscales, effects are less visible so it seems safe to reduce the
+        number of iterations for performance.
+*/
+
+#ifdef __GNUC__
+  #pragma GCC push_options
+  #pragma GCC optimize ("fast-math", "fp-contract=fast", "finite-math-only", "no-math-errno")
+#endif
+
+#define CAPTURE_KERNEL_ALIGN 32      // distance in floats between two kernels in the coeffs table
+#define CAPTURE_BLEND_EPS 0.01f      // pixels with a blend weight not above this are left untouched
+#define CAPTURE_GAUSS_FRACTION 0.01f // sigma step between two consecutive kernel indices
+#define CAPTURE_YMIN 0.001f          // luminance below this masks out the pixel neighbourhood
+#define CAPTURE_THRESHPOWER 0.15f    // exponent shaping the variance based blend weight
+#define CAPTURE_CFACLIP 0.9f         // scales the per colour level above which raw data is masked as clipped
+
+static inline void _calc_9x9_gauss_coeffs(float *coeffs, const float sigma)
+{ // writes one quadrant of a normalized 9x9 gaussian: coeffs[5*k+j] is the weight for offset (k, j), k,j = 0..4
+  float kernel[9][9];
+  const float range = 4.5f * 4.5f;          // squared distance limit, taps beyond it are zero
+  const float temp = -2.0f * sigma * sigma; // rad / temp == -rad / (2 sigma^2)
+  float sum = 0.0;
+  for(int k = -4; k < 5; k++)
+  {
+    for(int j = -4; j < 5; j++)
+    {
+      const float rad = (float)(k*k + j*j);
+      if(rad <= range)
+      {
+        kernel[k + 4][j + 4] = expf(rad / temp);
+        sum += kernel[k + 4][j + 4]; // sum over all 81 taps, used for normalization
+      }
+      else
+        kernel[k + 4][j + 4] = 0.0f;
+    }
+  }
+
+  for(int k = 0; k < 5; k++)
+    for(int j = 0; j < 5; j++)
+      coeffs[5*k+j] = kernel[k+4][j+4] / sum; // the kernel is symmetric so only 25 values are stored
+}
+
+static inline unsigned char _sigma_to_index(const float sigma)
+{ // sigma in steps of CAPTURE_GAUSS_FRACTION, truncated and clamped to 0..UCHAR_MAX
+  return CLAMP((int)(sigma / CAPTURE_GAUSS_FRACTION), 0, UCHAR_MAX);
+}
+
+// provide a per pixel map of kernel indices so the convolutions can look up their coeffs; NULL if allocation fails
+static unsigned char *_cs_precalc_gauss_idx(dt_iop_module_t *self,         // gives access to the image dimensions
+                                            const dt_iop_roi_t *const roi, // region the map is built for
+                                            const float isigma,            // base sigma
+                                            const float boost)             // extra sigma far from the centre
+{
+  const dt_image_t *img = &self->dev->image_storage;
+  const int rwidth = img->p_width / 2;
+  const int rheight = img->p_height / 2;
+  const float mdim = MIN(rwidth, rheight); // normalizes the distance from the centre
+  const int width = roi->width;
+  const int height = roi->height;
+  const int dy = roi->y;
+  const int dx = roi->x;
+  unsigned char *table = dt_alloc_aligned((size_t)height * width); // one index per pixel, the caller frees it
+  if(!table) return NULL;
+
+  DT_OMP_FOR()
+  for(int row = 0; row < height; row++)
+  {
+    const float frow = row + dy - rheight;
+    for(int col = 0; col < width; col++)
+    {
+      const float fcol = col + dx - rwidth;
+      const float sc = sqrtf(frow * frow + fcol * fcol) / mdim;
+      const float corr = MAX(0.0f, sc - 0.5f); // no boost as long as sc <= 0.5
+      float sigma = isigma + boost * powf(corr, 1.5f);
+
+      // special care for the roi borders: scale sigma down linearly, only the first matching test applies
+      if(col < 8)               sigma *= (float)col * 0.125f;
+      else if(row < 8)          sigma *= (float)row * 0.125f;
+      else if(col > width - 8)  sigma *= (float)(width - col) * 0.125f;  // NOTE(review): 7 columns here vs 8 on the left
+      else if(row > height - 8) sigma *= (float)(height - row) * 0.125f; // NOTE(review): 7 rows here vs 8 at the top
+
+      table[row * width + col] = _sigma_to_index(sigma);
+    }
+  }
+  return table;
+}
+
+#define RAWEPS 0.005f
+static float _calcRadiusBayer(const float *in,
+                              const int width,
+                              const int height,
+                              const float lowerLimit,  // the larger value of a pair must exceed this
+                              const float upperLimit,  // neighbourhood values at or above this count as clipped
+                              const uint32_t filters)
+{ // derives a radius from the largest ratio found between diagonally adjacent samples
+  const unsigned int fc[2] = {FC(0, 0, filters), FC(1, 0, filters)};
+  float maxRatio = 1.f;
+  DT_OMP_FOR(reduction(max: maxRatio))
+  for(int row = 4; row < height - 4; ++row)
+  {
+    for(int col = 5 + (fc[row & 1] & 1); col < width - 4; col += 2) // presumably visits the green sites only
+    {
+      const float *cfa = in + row*width + col;
+      const float val00 = cfa[0];
+      if(val00 > RAWEPS)
+      {
+        const float val1m1 = cfa[width-1]; // lower left neighbour
+        const float val1p1 = cfa[width+1]; // lower right neighbour
+        const float maxVal0 = MAX(val00, val1m1);
+        if(val1m1 > RAWEPS && maxVal0 > lowerLimit)
+        {
+          const float minVal = MIN(val00, val1m1);
+          if(maxVal0 > maxRatio * minVal) // ratio would exceed the current maximum
+          {
+            gboolean clipped = FALSE;
+            if(maxVal0 == val00)
+            { // check for influence by clipped green in neighborhood
+              if(MAX(MAX(cfa[-width-1], cfa[-width+1]), val1p1) >= upperLimit)
+                clipped = TRUE;
+            }
+            else
+            { // check for influence by clipped green in neighborhood
+              if(MAX(MAX(MAX(cfa[-2], val00), cfa[2*width-2]), cfa[2*width]) >= upperLimit)
+                clipped = TRUE;
+            }
+            if(!clipped)
+              maxRatio = maxVal0 / minVal;
+          }
+        }
+
+        const float maxVal1 = MAX(val00, val1p1);
+        if(val1p1 > RAWEPS && maxVal1 > lowerLimit)
+        {
+          const float minVal = MIN(val00, val1p1);
+          if(maxVal1 > maxRatio * minVal)
+          {
+            if(maxVal1 == val00)
+            { // check for influence by clipped green in neighborhood
+              if(MAX(MAX(cfa[-width-1], cfa[-width+1]), val1p1) >= upperLimit)
+                continue;
+             }
+            else
+            { // check for influence by clipped green in neighborhood
+              if(MAX(MAX(MAX(val00, cfa[2]), cfa[2*width]), cfa[2*width+2]) >= upperLimit)
+                continue;
+             }
+            maxRatio = maxVal1 / minVal;
+          }
+        }
+      }
+    }
+  }
+  return sqrtf(1.0f / logf(maxRatio)); // NOTE(review): maxRatio is still 1 if no pair qualified, logf() is 0 then
+}
+
+static float _calcRadiusXtrans(const float *in,
+                               const float lowerLimit,  // the larger value of a pair must exceed this
+                               const float upperLimit,  // values at or above this count as clipped
+                               const dt_iop_roi_t *const roi,
+                               const uint8_t(*const xtrans)[6])
+{ // derives a radius from the largest ratio found between neighbouring unclipped green samples
+  const int width = roi->width;
+  const int height = roi->height;
+
+  int startx = 6, starty = 6; // defaults in case no site matches
+  gboolean found = FALSE;
+  for(int y = 6; y < 12 && !found; y++)
+  {
+    for(int x = 6; x < 12 && !found; x++)
+    {
+      if(FCxtrans(y, x, roi, xtrans) == 1)
+      {
+        if(FCxtrans(y, x - 1, roi, xtrans) != FCxtrans(y, x + 1, roi, xtrans))
+        {
+          if(FCxtrans(y - 1, x, roi, xtrans) != 1)
+          {
+            if(FCxtrans(y, x - 1, roi, xtrans) != 1)
+            { // keep the match itself; a loop counter used as result would be one row too far after leaving the loops
+              found = TRUE;
+              starty = y; startx = x;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  float maxRatio = 1.0f;
+  DT_OMP_FOR(reduction(max: maxRatio))
+  for(int row = starty + 2; row < height - 4; row += 3)
+  {
+    for(int col = startx + 2; col < width - 4; col += 3)
+    {
+      const float *cfa = in + row*width + col;
+      const float valp1p1 = cfa[width+1];
+      const gboolean squareClipped = MAX(MAX(MAX(valp1p1, cfa[width+2]), cfa[2*width+1]), cfa[2*width+2]) >= upperLimit;
+      const float greenSolitary = cfa[0];
+      if(greenSolitary > RAWEPS && MAX(cfa[-width-1], cfa[-width+1]) < upperLimit)
+      {
+        if(greenSolitary < upperLimit)
+        {
+          const float valp1m1 = cfa[width-1];
+          if(valp1m1 > RAWEPS && MAX(MAX(MAX(cfa[width-2], valp1m1), cfa[2*width-2]), cfa[2*width-1]) < upperLimit) // all four sites of the lower left square
+          {
+            const float maxVal = MAX(greenSolitary, valp1m1);
+            if(maxVal > lowerLimit)
+            {
+              const float minVal = MIN(greenSolitary, valp1m1);
+              if(maxVal > maxRatio * minVal)
+                maxRatio = maxVal / minVal;
+            }
+          }
+          if(valp1p1 > RAWEPS && !squareClipped)
+          {
+            const float maxVal = MAX(greenSolitary, valp1p1);
+            if(maxVal > lowerLimit)
+            {
+              const float minVal = MIN(greenSolitary, valp1p1);
+              if(maxVal > maxRatio * minVal)
+                maxRatio = maxVal / minVal;
+            }
+          }
+        }
+      }
+
+      if(!squareClipped)
+      {
+        const float valp2p2 = cfa[2*width+2];
+        if(valp2p2 > RAWEPS)
+        {
+          if(valp1p1 > RAWEPS)
+          {
+            const float maxVal = MAX(valp1p1, valp2p2);
+            if(maxVal > lowerLimit)
+            {
+              const float minVal = MIN(valp1p1, valp2p2);
+              if(maxVal > maxRatio * minVal)
+                 maxRatio = maxVal / minVal;
+            }
+          }
+          const float greenSolitaryRight = cfa[3*width+3];
+          if(MAX(MAX(greenSolitaryRight, cfa[4*width+2]), cfa[4*width+4]) < upperLimit)
+          {
+            if(greenSolitaryRight > RAWEPS)
+            {
+              const float maxVal = MAX(greenSolitaryRight, valp2p2);
+              if(maxVal > lowerLimit)
+              {
+                const float minVal = MIN(greenSolitaryRight, valp2p2);
+                if(maxVal > maxRatio * minVal)
+                  maxRatio = maxVal / minVal;
+              }
+            }
+          }
+        }
+        const float valp1p2 = cfa[width+2];
+        const float valp2p1 = cfa[2*width+1];
+        if(valp2p1 > RAWEPS)
+        {
+          if(valp1p2 > RAWEPS)
+          {
+            const float maxVal = MAX(valp1p2, valp2p1);
+            if(maxVal > lowerLimit)
+            {
+              const float minVal = MIN(valp1p2, valp2p1);
+              if(maxVal > maxRatio * minVal)
+                maxRatio = maxVal / minVal;
+            }
+          }
+          const float greenSolitaryLeft = cfa[3*width];
+          if(MAX(MAX(greenSolitaryLeft, cfa[4*width-1]), cfa[4*width+1]) < upperLimit)
+          {
+            if(greenSolitaryLeft > RAWEPS)
+            {
+              const float maxVal = MAX(greenSolitaryLeft, valp2p1);
+              if(maxVal > lowerLimit)
+              {
+                const float minVal = MIN(greenSolitaryLeft, valp2p1);
+                if(maxVal > maxRatio * minVal)
+                  maxRatio = maxVal / minVal;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  return sqrtf(1.0f / logf(maxRatio)); // NOTE(review): maxRatio is still 1 if no pair qualified, logf() is 0 then
+}
+#undef RAWEPS
+
+DT_OMP_DECLARE_SIMD(aligned(in, out, kernels:64))
+static inline void _blur_mul(const float *const in,             // data to be blurred
+                             float *out,                        // multiplied in place by the blurred data
+                             const float *blend,                // per pixel weight, selects the pixels to process
+                             const float *const kernels,        // kernel table, CAPTURE_KERNEL_ALIGN floats per entry
+                             const unsigned char *const table,  // per pixel kernel index
+                             const int w1,                      // width of all buffers
+                             const int height)
+{
+  const int w2 = 2 * w1;
+  const int w3 = 3 * w1;
+  const int w4 = 4 * w1;
+
+  DT_OMP_FOR()
+  for(int row = 0; row < height; row++)
+  {
+    for(int col = 0; col < w1; col++)
+    {
+      const size_t i = (size_t)row * w1 + col;
+      if(blend[i] > CAPTURE_BLEND_EPS)
+      {
+        const float *kern = kernels + CAPTURE_KERNEL_ALIGN * table[i];
+        float val = 0.0f;
+        if(col >= 4 && row >= 4 && col < w1 - 4 && row < height - 4)
+        { // interior: unrolled convolution, taps sharing a coefficient are summed first
+          const float *d = in + i;
+          val =
+              kern[10+4] * (d[-w4-2] + d[-w4+2] + d[-w2-4] + d[-w2+4] + d[w2-4] + d[w2+4] + d[w4-2] + d[w4+2]) +
+              kern[5 +4] * (d[-w4-1] + d[-w4+1] + d[-w1-4] + d[-w1+4] + d[w1-4] + d[w1+4] + d[w4-1] + d[w4+1]) +
+              kern[4]    * (d[-w4  ] + d[   -4] + d[    4] + d[ w4  ]) +
+              kern[15+3] * (d[-w3-3] + d[-w3+3] + d[ w3-3] + d[ w3+3]) +
+              kern[10+3] * (d[-w3-2] + d[-w3+2] + d[-w2-3] + d[-w2+3] + d[w2-3] + d[w2+3] + d[w3-2] + d[w3+2]) +
+              kern[ 5+3] * (d[-w3-1] + d[-w3+1] + d[-w1-3] + d[-w1+3] + d[w1-3] + d[w1+3] + d[w3-1] + d[w3+1]) +
+              kern[   3] * (d[-w3  ] + d[   -3] + d[    3] + d[ w3  ]) +
+              kern[10+2] * (d[-w2-2] + d[-w2+2] + d[ w2-2] + d[ w2+2]) +
+              kern[ 5+2] * (d[-w2-1] + d[-w2+1] + d[-w1-2] + d[-w1+2] + d[w1-2] + d[w1+2] + d[w2-1] + d[w2+1]) +
+              kern[   2] * (d[-w2  ] + d[   -2] + d[    2] + d[ w2  ]) +
+              kern[ 5+1] * (d[-w1-1] + d[-w1+1] + d[ w1-1] + d[ w1+1]) +
+              kern[   1] * (d[-w1  ] + d[   -1] + d[    1] + d[ w1  ]) +
+              kern[   0] * (d[0]);
+        }
+        else
+        { // border: generic loop that skips taps outside of the buffer
+          for(int ir = -4; ir <= 4; ir++)
+          {
+            const int irow = row+ir;
+            if(irow >= 0 && irow < height)
+            {
+              for(int ic = -4; ic <= 4; ic++)
+              {
+                const int icol = col+ic;
+                if(icol >=0 && icol < w1)
+                  val += kern[5 * ABS(ir) + ABS(ic)] * in[(size_t)irow * w1 + icol];
+              }
+            }
+          }
+        }
+        out[i] *= val;
+      }
+      // if blend value is too low we don't have to copy data as we also didn't in _blur_div
+      // and we just keep the original
+    }
+  }
+}
+
+DT_OMP_DECLARE_SIMD(aligned(in, out, divbuff, kernels :64))
+static inline void _blur_div(const float *const in,             // data to be blurred
+                             float *out,                        // receives divbuff divided by the blurred data
+                             const float *const divbuff,        // numerator of the division
+                             const float *blend,                // per pixel weight, selects the pixels to process
+                             const float *const kernels,        // kernel table, CAPTURE_KERNEL_ALIGN floats per entry
+                             const unsigned char *const table,  // per pixel kernel index
+                             const int w1,                      // width of all buffers
+                             const int height)
+{
+  const int w2 = 2 * w1;
+  const int w3 = 3 * w1;
+  const int w4 = 4 * w1;
+
+  DT_OMP_FOR()
+  for(int row = 0; row < height; row++)
+  {
+    for(int col = 0; col < w1; col++)
+    {
+      const size_t i = (size_t)row * w1 + col;
+      if(blend[i] > CAPTURE_BLEND_EPS) // other pixels keep whatever out held before
+      {
+        const float *kern = kernels + CAPTURE_KERNEL_ALIGN * table[i];
+        float val = 0.0f;
+        if(col >= 4 && row >= 4 && col < w1 - 4 && row < height - 4)
+        { // interior: unrolled convolution, taps sharing a coefficient are summed first
+          const float *d = in + i;
+          val =
+              kern[10+4] * (d[-w4-2] + d[-w4+2] + d[-w2-4] + d[-w2+4] + d[w2-4] + d[w2+4] + d[w4-2] + d[w4+2]) +
+              kern[5 +4] * (d[-w4-1] + d[-w4+1] + d[-w1-4] + d[-w1+4] + d[w1-4] + d[w1+4] + d[w4-1] + d[w4+1]) +
+              kern[4]    * (d[-w4  ] + d[   -4] + d[    4] + d[ w4  ]) +
+              kern[15+3] * (d[-w3-3] + d[-w3+3] + d[ w3-3] + d[ w3+3]) +
+              kern[10+3] * (d[-w3-2] + d[-w3+2] + d[-w2-3] + d[-w2+3] + d[w2-3] + d[w2+3] + d[w3-2] + d[w3+2]) +
+              kern[ 5+3] * (d[-w3-1] + d[-w3+1] + d[-w1-3] + d[-w1+3] + d[w1-3] + d[w1+3] + d[w3-1] + d[w3+1]) +
+              kern[   3] * (d[-w3  ] + d[   -3] + d[    3] + d[ w3  ]) +
+              kern[10+2] * (d[-w2-2] + d[-w2+2] + d[ w2-2] + d[ w2+2]) +
+              kern[ 5+2] * (d[-w2-1] + d[-w2+1] + d[-w1-2] + d[-w1+2] + d[w1-2] + d[w1+2] + d[w2-1] + d[w2+1]) +
+              kern[   2] * (d[-w2  ] + d[   -2] + d[    2] + d[ w2  ]) +
+              kern[ 5+1] * (d[-w1-1] + d[-w1+1] + d[ w1-1] + d[ w1+1]) +
+              kern[   1] * (d[-w1  ] + d[   -1] + d[    1] + d[ w1  ]) +
+              kern[   0] * (d[0]);
+        }
+        else
+        { // border: generic loop that skips taps outside of the buffer
+          for(int ir = -4; ir <= 4; ir++)
+          {
+            const int irow = row+ir;
+            if(irow >= 0 && irow < height)
+            {
+              for(int ic = -4; ic <= 4; ic++)
+              {
+                const int icol = col+ic;
+                if(icol >=0 && icol < w1)
+                  val += kern[5 * ABS(ir) + ABS(ic)] * in[(size_t)irow * w1 + icol];
+              }
+            }
+          }
+        }
+        out[i] = divbuff[i] / MAX(val, 0.00001f); // the lower bound avoids a division by zero
+      }
+    }
+  }
+}
+
+static void _prepare_blend(const float *cfa,                  // raw data, one float per pixel
+                           const float *rgb,                  // demosaiced data, four floats per pixel
+                           const uint32_t filters,
+                           const uint8_t (*const xtrans)[6],
+                           const dt_iop_roi_t *const roi,
+                           float *mask,                       // out: 1.0 where sharpening is allowed, else 0.0
+                           float *Yold,                       // out: luminance of rgb
+                           const float *whites,               // per colour level above which cfa is taken as clipped
+                           const int w1,                      // width of all buffers
+                           const int height)
+{
+  dt_iop_image_fill(mask, 1.0f, w1, height, 1); // start with everything allowed, the loop only writes zeros
+  const int w2 = 2 * w1;
+  DT_OMP_FOR(collapse(2))
+  for(size_t row = 0; row < height; row++)
+  {
+    for(size_t col = 0; col < w1; col++)
+    {
+      const size_t k = row * w1 + col;
+      // Photometric/digital ITU BT.709
+      const float Y = MAX(0.0f, 0.2126f*rgb[k*4] + 0.7152f*rgb[k*4+1] + 0.0722f*rgb[k*4+2]);
+      Yold[k] = Y;
+      if(row > 1 && col > 1 && row < height-2 && col < w1-2)
+      {
+        const int color = (filters == 9u) ? FCxtrans(row, col, roi, xtrans) : FC(row, col, filters);
+        if(cfa[k] > whites[color] || Y < CAPTURE_YMIN)
+        { // clipped or too dark: clear the 5x5 neighbourhood without its four corners
+          mask[k-w2-1] = mask[k-w2]   = mask[k-w2+1] =
+          mask[k-w1-2] = mask[k-w1-1] = mask[k-w1]   = mask[k-w1+1] = mask[k-w1+2] =
+          mask[k-2]    = mask[k-1]    = mask[k]      = mask[k+1]    = mask[k+2] =
+          mask[k+w1-2] = mask[k+w1-1] = mask[k+w1]   = mask[k+w1+1] = mask[k+w1+2] =
+          mask[k+w2-1] = mask[k+w2]   = mask[k+w2+1] = 0.0f;
+        }
+      }
+      else
+        mask[k] = 0.0f; // the two outermost rows and columns are never sharpened
+    }
+  }
+}
+
+static void _modify_blend(float *blend,            // in/out: mask, scaled by the local contrast weight
+                          float *Yold,             // luminance the statistics are taken from
+                          float *luminance,        // out: copy of Yold
+                          const float threshold,   // subtracted from the scaled standard deviation
+                          const int width,
+                          const int height)
+{
+  DT_OMP_FOR()
+  for(int irow = 0; irow < height; irow++)
+  {
+    const int row = CLAMP(irow, 2, height-3); // keeps the window inside the buffer
+    for(int icol = 0; icol < width; icol++)
+    {
+      const int col = CLAMP(icol, 2, width-3);
+      const size_t k = (size_t)irow * width + icol;
+      float av = 0.0f; // mean over 21 pixels: three rows of five plus three above and three below
+      for(int y = row-1; y < row+2; y++)
+      {
+        for(int x = col-2; x < col+3; x++)
+          av += Yold[(size_t)y*width + x];
+      }
+      for(int x = col-1; x < col+2; x++)
+      {
+        av += Yold[(size_t)(row-2)*width + x];
+        av += Yold[(size_t)(row+2)*width + x];
+      }
+      av /= 21.0f;
+
+      float sv = 0.0f; // sum of squared deviations over the same 21 pixels
+      for(int y = row-1; y < row+2; y++)
+      {
+        for(int x = col-2; x < col+3; x++)
+          sv += sqrf(Yold[(size_t)y*width + x] - av);
+      }
+      for(int x = col-1; x < col+2; x++)
+      {
+        sv += sqrf(Yold[(size_t)(row-2)*width + x] - av);
+        sv += sqrf(Yold[(size_t)(row+2)*width + x] - av);
+      }
+      sv = powf(MAX(0.0f, 5.0f * sqrtf(sv / 21.f) - threshold), CAPTURE_THRESHPOWER);
+      blend[k] *= CLIP(sv); // zero weight wherever the scaled deviation stays below the threshold
+      luminance[k] = Yold[k];
+    }
+  }
+}
+
+static inline float _get_threshold(const float threshold)
+{ // maps the cs_thrs parameter to the threshold handed to _modify_blend(), quadratic in the parameter
+  return 0.06f * sqrf(threshold);
+}
+
+void _capture_sharpen(dt_iop_module_t *self,
+                      dt_dev_pixelpipe_iop_t *piece,
+                      float *in,                       // raw data, one float per pixel
+                      float *out,                      // demosaiced data, four floats per pixel, modified in place
+                      const dt_iop_roi_t *const roi,
+                      const gboolean showmask)         // write the blend mask to the fourth channel instead of sharpening
+{
+  dt_dev_pixelpipe_t *pipe = piece->pipe;
+
+  const size_t width = roi->width;
+  const size_t height = roi->height;
+  const size_t pixels = width * height;
+  const dt_iop_demosaic_data_t *d = piece->data;
+  const dt_iop_demosaic_global_data_t *gd = self->global_data;
+  dt_iop_demosaic_gui_data_t *g = self->gui_data;
+
+  if(pipe->type & DT_DEV_PIXELPIPE_THUMBNAIL)
+  {
+    const gboolean hqthumb = _get_thumb_quality(pipe->final_width, pipe->final_height);
+    if(!hqthumb) return;
+  }
+
+  const int iterations = powf((float)d->cs_strength, 1.3f);
+  if(iterations < 1 && !showmask) return;
+
+  const uint8_t(*const xtrans)[6] = (const uint8_t(*const)[6])pipe->dsc.xtrans;
+  const uint32_t filters = pipe->dsc.filters;
+  const dt_iop_buffer_dsc_t *dsc = &pipe->dsc;
+  const gboolean wbon = dsc->temperature.enabled;
+  const dt_aligned_pixel_t icoeffs = { wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[0] : CAPTURE_CFACLIP,
+                                       wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[1] : CAPTURE_CFACLIP,
+                                       wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[2] : CAPTURE_CFACLIP,
+                                       0.0f };
+  const gboolean fullpipe = pipe->type & DT_DEV_PIXELPIPE_FULL;
+  const gboolean autoradius = fullpipe && g && g->autoradius;
+  const float old_radius = d->cs_radius;
+  float radius = old_radius;
+  if(autoradius || radius < 0.005f) // requested via gui or no radius set so far
+  {
+    radius = filters != 9u
+              ? _calcRadiusBayer(in, width, height, 0.01f, 1.0f, filters)
+              : _calcRadiusXtrans(in, 0.01f, 1.0f, roi, xtrans);
+
+    dt_print_pipe(DT_DEBUG_PIPE, filters != 9u ? "bayer autoradius" : "xtrans autoradius",
+      pipe, self, DT_DEVICE_CPU, roi, NULL, "autoradius=%.2f", radius);
+    if(radius > UCHAR_MAX * CAPTURE_GAUSS_FRACTION) radius = old_radius; // infinite or beyond the kernel table: unusable
+    if(!feqf(radius, old_radius, 0.002f))
+    {
+      if(fullpipe)
+      {
+        if(g)
+        {
+          dt_control_log(_("calculated radius: %.2f"), radius);
+          g->autoradius = TRUE; // picked up by the pipe finished signal handler
+        }
+        dt_iop_demosaic_params_t *p = self->params;
+        p->cs_radius = radius;
+      }
+    }
+    else if(g) g->autoradius = FALSE; // nothing new to report
+  }
+
+  unsigned char *gauss_idx = NULL;
+  gboolean error = TRUE;
+
+  float *luminance = dt_alloc_align_float(pixels);
+  float *tmp2 = dt_alloc_align_float(pixels);
+  float *tmp1 = dt_alloc_align_float(pixels);
+  float *blendmask = dt_alloc_align_float(pixels);
+  if(!luminance || !tmp2 || !tmp1 || !blendmask)
+    goto finalize;
+
+  const float threshold = _get_threshold(d->cs_thrs);
+
+  // tmp2 will hold the temporary clipmask, tmp1 holds Y data
+  _prepare_blend(in, out, filters, xtrans, roi, tmp2, tmp1, icoeffs, width, height);
+  // modify clipmask in tmp2 according to Y variance, also write L to luminance
+  _modify_blend(tmp2, tmp1, luminance, threshold, width, height);
+  dt_gaussian_fast_blur(tmp2, blendmask, width, height, 2.0f, 0.0f, 1.0f, 1);
+
+  if(showmask)
+  {
+    DT_OMP_FOR()
+    for(size_t k = 0; k < pixels*4; k +=4)
+    {
+      const float blend = blendmask[k/4];
+      out[k+3] = blend < CAPTURE_BLEND_EPS ? 0.0f : blend;
+    }
+    error = FALSE;
+    goto finalize;
+  }
+
+  gauss_idx = _cs_precalc_gauss_idx(self, roi, radius, d->cs_boost);
+  if(!gauss_idx) goto finalize;
+
+  for(int iter = 0; iter < iterations && !dt_pipe_shutdown(pipe); iter++)
+  {
+    _blur_div(tmp1, tmp2, luminance, blendmask, gd->gauss_coeffs, gauss_idx, width, height);
+    _blur_mul(tmp2, tmp1, blendmask, gd->gauss_coeffs, gauss_idx, width, height);
+  }
+
+  DT_OMP_FOR()
+  for(size_t k = 0; k < (size_t)width * height; k++)
+  {
+    if(blendmask[k] > CAPTURE_BLEND_EPS)
+    {
+      const float luminance_new = interpolatef(blendmask[k], tmp1[k], luminance[k]);
+      const float factor = luminance_new / MAX(luminance[k], 0.00001f);
+      for_three_channels(c) out[k*4+c] *= factor; // scale all colour channels by the luminance gain
+    }
+  }
+
+  error = FALSE;
+
+  finalize:
+  if(error)
+    dt_print_pipe(DT_DEBUG_ALWAYS, "capture sharpen failed", pipe, self, DT_DEVICE_CPU, NULL, NULL,
+      "unable to allocate memory");
+
+  dt_free_align(gauss_idx);
+  dt_free_align(tmp2);
+  dt_free_align(tmp1);
+  dt_free_align(luminance);
+  dt_free_align(blendmask);
+}
+
+// revert aggressive optimizing
+#ifdef __GNUC__
+  #pragma GCC pop_options
+#endif
+
+#if HAVE_OPENCL
+
+int _capture_sharpen_cl(dt_iop_module_t *self,
+                        dt_dev_pixelpipe_iop_t *piece,
+                        cl_mem dev_in,
+                        cl_mem dev_out,
+                        const dt_iop_roi_t *const roi,
+                        const gboolean showmask)
+{
+  dt_dev_pixelpipe_t *pipe = piece->pipe;
+
+  const int width = roi->width;
+  const int height = roi->height;
+  const int bsize = sizeof(float) * width * height;
+  const int devid = piece->pipe->devid;
+
+  const dt_iop_demosaic_data_t *d = piece->data;
+  dt_iop_demosaic_global_data_t *const gd = self->global_data;
+  dt_iop_demosaic_gui_data_t *g = self->gui_data;
+
+  if(pipe->type & DT_DEV_PIXELPIPE_THUMBNAIL)
+  {
+    const gboolean hqthumb = _get_thumb_quality(pipe->final_width, pipe->final_height);
+    if(!hqthumb) return CL_SUCCESS;
+  }
+
+  const int iterations = powf((float)d->cs_strength, 1.3f);
+  if(iterations < 1 && !showmask) return CL_SUCCESS;
+
+  const uint32_t filters = pipe->dsc.filters;
+  const dt_iop_buffer_dsc_t *dsc = &pipe->dsc;
+  const gboolean wbon = dsc->temperature.enabled;
+  dt_aligned_pixel_t icoeffs = { wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[0] : CAPTURE_CFACLIP,
+                                 wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[1] : CAPTURE_CFACLIP,
+                                 wbon ? CAPTURE_CFACLIP * dsc->temperature.coeffs[2] : CAPTURE_CFACLIP,
+                                 0.0f };
+
+  const gboolean fullpipe = pipe->type & DT_DEV_PIXELPIPE_FULL;
+  const gboolean autoradius = fullpipe && g && g->autoradius;
+  const float old_radius = d->cs_radius;
+  float radius = old_radius;
+  if(autoradius || radius < 0.005f)
+  {
+    float *in = dt_alloc_align_float((size_t)width * height);
+    if(in)
+    {
+      if(dt_opencl_copy_device_to_host(devid, in, dev_in, width, height, sizeof(float)) == CL_SUCCESS)
+      {
+        radius = filters != 9u
+                ? _calcRadiusBayer(in, width, height, 0.01f, 1.0f, filters)
+                : _calcRadiusXtrans(in, 0.01f, 1.0f, roi, (const uint8_t(*const)[6])pipe->dsc.xtrans);
+        dt_print_pipe(DT_DEBUG_PIPE, filters != 9u ? "bayer autoradius" : "xtrans autoradius",
+            pipe, self, devid, roi, NULL, "autoradius=%.2f", radius);
+
+        if(!feqf(radius, old_radius, 0.002f))
+        {
+          if(fullpipe)
+          {
+            if(g)
+            {
+              dt_control_log(_("calculated radius: %.2f"), radius);
+              g->autoradius = TRUE;
+            }
+            dt_iop_demosaic_params_t *p = self->params;
+            p->cs_radius = radius;
+          }
+        }
+        else if(g) g->autoradius = FALSE;
+      }
+      dt_free_align(in);
+    }
+  }
+
+  const float threshold = _get_threshold(d->cs_thrs);
+
+  cl_mem gcoeffs = NULL;
+  cl_mem gauss_idx = NULL;
+
+  cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+  cl_mem blendmask = dt_opencl_alloc_device_buffer(devid, bsize);
+  cl_mem luminance = dt_opencl_alloc_device_buffer(devid, bsize);
+  cl_mem tmp2 = dt_opencl_alloc_device_buffer(devid, bsize);
+  cl_mem tmp1 = dt_opencl_alloc_device_buffer(devid, bsize);
+  cl_mem xtrans = dt_opencl_copy_host_to_device_constant(devid, sizeof(pipe->dsc.xtrans), pipe->dsc.xtrans);
+  cl_mem whites = dt_opencl_copy_host_to_device_constant(devid, 4 * sizeof(float), icoeffs);
+  cl_mem dev_rgb = dt_opencl_duplicate_image(devid, dev_out);
+
+  if(!blendmask || !luminance || !tmp2 || !tmp1 || !xtrans || !whites || !dev_rgb) goto finish;
+
+  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prefill_clip_mask, width, height,
+          CLARG(tmp2), CLARG(width), CLARG(height));
+  if(err != CL_SUCCESS) goto finish;
+
+  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->prepare_blend, width, height,
+          CLARG(dev_in), CLARG(dev_out), CLARG(filters), CLARG(xtrans), CLARG(tmp2), CLARG(tmp1),
+          CLARG(whites), CLARG(width), CLARG(height));
+  if(err != CL_SUCCESS) goto finish;
+
+  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->modify_blend, width, height,
+          CLARG(tmp2), CLARG(tmp1), CLARG(luminance), CLARG(threshold), CLARG(width), CLARG(height));
+  if(err != CL_SUCCESS) goto finish;
+
+  err = dt_gaussian_fast_blur_cl_buffer(devid, tmp2, blendmask, width, height, 2.0f, 1, 0.0f, 1.0f);
+  if(err != CL_SUCCESS) goto finish;
+
+  if(showmask)
+  {
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->show_blend_mask, width, height,
+          CLARG(dev_rgb), CLARG(dev_out), CLARG(blendmask), CLARG(width), CLARG(height));
+    goto finish;
+  }
+
+  unsigned char *f_gauss_idx = _cs_precalc_gauss_idx(self, roi, radius, d->cs_boost);
+  if(f_gauss_idx)
+  {
+    gcoeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * (UCHAR_MAX+1) * CAPTURE_KERNEL_ALIGN, gd->gauss_coeffs);
+    gauss_idx = dt_opencl_copy_host_to_device_constant(devid, sizeof(unsigned char) * height * width, f_gauss_idx);
+  }
+  dt_free_align(f_gauss_idx);
+
+  err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+  if(!gcoeffs || !gauss_idx) goto finish;
+
+  for(int iter = 0; iter < iterations && !dt_pipe_shutdown(pipe); iter++)
+  {
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->gaussian_9x9_div, width, height,
+      CLARG(tmp1), CLARG(tmp2), CLARG(luminance), CLARG(blendmask),
+      CLARG(gcoeffs), CLARG(gauss_idx), CLARG(width), CLARG(height));
+    if(err != CL_SUCCESS) goto finish;
+
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->gaussian_9x9_mul, width, height,
+      CLARG(tmp2), CLARG(tmp1), CLARG(blendmask),
+      CLARG(gcoeffs), CLARG(gauss_idx), CLARG(width), CLARG(height));
+    if(err != CL_SUCCESS) goto finish;
+  }
+
+  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->capture_result, width, height,
+    CLARG(dev_rgb), CLARG(dev_out), CLARG(blendmask), CLARG(luminance), CLARG(tmp1),
+    CLARG(width), CLARG(height));
+
+  finish:
+  if(err != CL_SUCCESS)
+    dt_print_pipe(DT_DEBUG_ALWAYS, "capture sharpen failed",
+      pipe, self, devid, NULL, NULL,
+      "Error: %s", cl_errstr(err));
+
+  dt_opencl_release_mem_object(gauss_idx);
+  dt_opencl_release_mem_object(gcoeffs);
+  dt_opencl_release_mem_object(blendmask);
+  dt_opencl_release_mem_object(dev_rgb);
+  dt_opencl_release_mem_object(tmp2);
+  dt_opencl_release_mem_object(tmp1);
+  dt_opencl_release_mem_object(luminance);
+  dt_opencl_release_mem_object(xtrans);
+  dt_opencl_release_mem_object(whites);
+
+  return err;
+}
+#endif // OpenCL