Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Faster gaussian blur for OpenCL and CPU #17572

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 1 addition & 33 deletions data/kernels/demosaic_rcd.cl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
This file is part of darktable,
rcd_cl implemented by Hanno Schwalm (hanno@schwalm-bremen.de)
Copyright (C) 2020-2024 darktable developers.

darktable is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Expand Down Expand Up @@ -346,38 +346,6 @@ __kernel void calc_detail_blend(global float *in, global float *out, const int w
out[idx] = detail ? blend : 1.0f - blend;
}

/*
  Blur a single-channel float mask with a 9x9 stencil whose corners are cut off.

  src, out: buffers of w * height floats, one float per pixel.
  kern:     13 coefficients; kern[k] weights the ring of offsets (|dy|, |dx|):
              0:(0,0)  1:(0,1)  2:(1,1)  3:(0,2)  4:(1,2)  5:(2,2)  6:(0,3)
              7:(1,3)  8:(2,3)  9:(3,3) 10:(0,4) 11:(1,4) 12:(2,4)
            each entry is also applied to the transposed offsets.

  Pixels closer than 4 to an image edge do not get their own window; they
  reuse the window centred on the nearest pixel that is at least 4 away from
  every edge. The window only stays inside the image when w >= 9 and
  height >= 9. The result is clamped to [0, 1].
*/
__kernel void fastblur_mask_9x9(global float *src, global float *out, const int w, const int height, global const float *kern)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);
  if(x >= w || y >= height) return;

  // Pull the stencil centre inwards so that all +-4 taps stay inside the image.
  int cx = x;
  if(x < 4) cx = 4;
  if(x > w - 5) cx = w - 5;

  int cy = y;
  if(y < 4) cy = 4;
  if(y > height - 5) cy = height - 5;

  // Row strides for 1..4 lines above/below the centre.
  const int r1 = w;
  const int r2 = 2 * w;
  const int r3 = 3 * w;
  const int r4 = 4 * w;

  // c[0] is the (possibly shifted) stencil centre.
  global const float *c = src + mad24(cy, w, cx);

  // One sum per coefficient; sNM covers the offsets (N, M) and (M, N) with all sign combinations.
  const float s42 = c[-r4 - 2] + c[-r4 + 2] + c[-r2 - 4] + c[-r2 + 4]
                  + c[r2 - 4] + c[r2 + 4] + c[r4 - 2] + c[r4 + 2];
  const float s41 = c[-r4 - 1] + c[-r4 + 1] + c[-r1 - 4] + c[-r1 + 4]
                  + c[r1 - 4] + c[r1 + 4] + c[r4 - 1] + c[r4 + 1];
  const float s40 = c[-r4] + c[-4] + c[4] + c[r4];
  const float s33 = c[-r3 - 3] + c[-r3 + 3] + c[r3 - 3] + c[r3 + 3];
  const float s32 = c[-r3 - 2] + c[-r3 + 2] + c[-r2 - 3] + c[-r2 + 3]
                  + c[r2 - 3] + c[r2 + 3] + c[r3 - 2] + c[r3 + 2];
  const float s31 = c[-r3 - 1] + c[-r3 + 1] + c[-r1 - 3] + c[-r1 + 3]
                  + c[r1 - 3] + c[r1 + 3] + c[r3 - 1] + c[r3 + 1];
  const float s30 = c[-r3] + c[-3] + c[3] + c[r3];
  const float s22 = c[-r2 - 2] + c[-r2 + 2] + c[r2 - 2] + c[r2 + 2];
  const float s21 = c[-r2 - 1] + c[-r2 + 1] + c[-r1 - 2] + c[-r1 + 2]
                  + c[r1 - 2] + c[r1 + 2] + c[r2 - 1] + c[r2 + 1];
  const float s20 = c[-r2] + c[-2] + c[2] + c[r2];
  const float s11 = c[-r1 - 1] + c[-r1 + 1] + c[r1 - 1] + c[r1 + 1];
  const float s10 = c[-r1] + c[-1] + c[1] + c[r1];

  // Accumulate from the outermost ring towards the centre.
  float acc = kern[12] * s42;
  acc += kern[11] * s41;
  acc += kern[10] * s40;
  acc += kern[9] * s33;
  acc += kern[8] * s32;
  acc += kern[7] * s31;
  acc += kern[6] * s30;
  acc += kern[5] * s22;
  acc += kern[4] * s21;
  acc += kern[3] * s20;
  acc += kern[2] * s11;
  acc += kern[1] * s10;
  acc += kern[0] * c[0];

  // The result is always written at the work-item's own position, not the shifted centre.
  out[mad24(y, w, x)] = clamp(acc, 0.0f, 1.0f);
}

kernel void rcd_border_green(read_only image2d_t in, write_only image2d_t out, const int width, const int height,
const unsigned int filters, local float *buffer, const int border)
{
Expand Down
224 changes: 224 additions & 0 deletions data/kernels/gaussian.cl
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,230 @@ gaussian_column_1c(global float *in, global float *out, unsigned int width, unsi
}
}

/*
  2D blur with a square window of radius dim (at most 9x9) for buffers holding
  1, 2 or 4 floats per pixel. One work-item computes one output pixel.

  input, output:  buffers of w * h pixels. They are accessed as float, float2
                  or float4 for ch == 1, 2 or 4; for any other ch nothing is
                  written.
  w, h:           image size in pixels.
  ch:             number of floats per pixel (1, 2 or 4).
  kern:           always an array of 25 floats for kernel coeffs.
                  Access i = y*5 + x, with x and y the absolute column and row
                  distance from the window centre (0..4).
  minval, maxval: bounds the result is clamped to; ch == 1 uses .x, ch == 2
                  uses .xy, ch == 4 uses all four components.
  dim:            window radius in pixels.

  Pixels that are at least dim away from every image edge take a hand-unrolled
  path for dim == 4, 3 or 2. These paths
    - only read kern entries with y <= x and apply each of them to the
      transposed offsets as well,
    - leave out the window corners: offsets (4,4), (4,3), (3,4) for dim == 4,
      (3,3) for dim == 3 and (2,2) for dim == 2,
    - do not divide by the coefficient sum.
  All other pixels, and any other dim, use a generic loop that skips taps
  outside the image and divides by the sum of the coefficients actually used.

  NOTE(review): the generic loop reads kern[5*|dy| + |dx|] for the full square
  window, i.e. also the corner and y > x entries the unrolled paths never read;
  both paths presumably agree only if kern is symmetric, zero in the omitted
  corners and normalized - confirm against the host code that fills kern.
  NOTE(review): dim > 4 would index kern beyond its 25 entries in the generic
  loop; presumably callers guarantee dim <= 4 - confirm.
*/
__kernel void gaussian_kernel_9x9(global float *input,
global float *output,
const int w,
const int h,
const int ch,
global const float *kern,
const float4 minval,
const float4 maxval,
const int dim)
{
const int col = get_global_id(0);
const int row = get_global_id(1);
// the global work size may exceed the image; surplus work-items do nothing
if((col >= w) || (row >= h)) return;

// index of this pixel, in units of pixels (not floats)
const int i = mad24(row, w, col);
// offsets to the rows 2, 3 and 4 lines away
const int w2 = 2 * w;
const int w3 = 3 * w;
const int w4 = 4 * w;

if(ch == 1)
{
global float *in = input;
global float *out = output;
float val = 0.0f;
// interior pixel, radius 4: every tap is inside the image, no bounds checks needed
if(dim == 4 && col >= 4 && row >= 4 && col < w - 4 && row < h - 4)
{
val = kern[10+4] * (in[i - w4 -2] + in[i - w4 +2] + in[i - w2 -4] + in[i - w2 +4] + in[i + w2 -4] + in[i + w2 +4] + in[i + w4 -2] + in[i + w4 +2])
+ kern[5 +4] * (in[i - w4 -1] + in[i - w4 +1] + in[i - w -4] + in[i - w +4] + in[i + w -4] + in[i + w +4] + in[i + w4 -1] + in[i + w4 +1])
+ kern[4] * (in[i - w4] + in[i - 4] + in[i + 4] + in[i + w4])
+ kern[15+3] * (in[i - w3 -3] + in[i - w3 +3] + in[i + w3 -3] + in[i + w3 +3])
+ kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+ kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w -3] + in[i - w +3] + in[i + w -3] + in[i + w +3] + in[i + w3 -1] + in[i + w3 +1])
+ kern[ 3] * (in[i - w3] + in[i - 3] + in[i + 3] + in[i + w3])
+ kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+ kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
// interior pixel, radius 3
else if(dim == 3 && col >= 3 && row >= 3 && col < w - 3 && row < h - 3)
{
val = kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+ kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w -3] + in[i - w +3] + in[i + w -3] + in[i + w +3] + in[i + w3 -1] + in[i + w3 +1])
+ kern[ 3] * (in[i - w3] + in[i - 3] + in[i + 3] + in[i + w3])
+ kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+ kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
// interior pixel, radius 2
else if(dim == 2 && col >= 2 && row >= 2 && col < w - 2 && row < h - 2)
{
val = kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
// border pixels and every other radius: bounds-checked loop over the full window
else
{
float sum = 0.0f;
float div = 0.0f;
for(int ir = -dim; ir <= dim; ir++)
{
const int irow = row+ir;
if(irow >= 0 && irow < h)
{
for(int ic = -dim; ic <= dim; ic++)
{
const int icol = col+ic;
if(icol >=0 && icol < w)
{
const float coeff = kern[5 * abs(ir) + abs(ic)];
// div collects the weights of the taps that are inside the image
div += coeff;
sum += coeff * in[mad24(irow, w, icol)];
}
}
}
}
// renormalize by the weights actually used; avoid dividing by zero
val = (div != 0.0f) ? sum / div : 0.0f;
}
out[i] = clamp(val, minval.x, maxval.x);
}

// Same scheme as the ch == 1 branch, operating on float2 pixels.
else if(ch == 2)
{
global float2 *in = (global float2 *)input;
global float2 *out = (global float2 *)output;
float2 val = 0.0f;
if(dim == 4 && col >= 4 && row >= 4 && col < w - 4 && row < h - 4)
{
val = kern[10+4] * (in[i - w4 -2] + in[i - w4 +2] + in[i - w2 -4] + in[i - w2 +4] + in[i + w2 -4] + in[i + w2 +4] + in[i + w4 -2] + in[i + w4 +2])
+ kern[5 +4] * (in[i - w4 -1] + in[i - w4 +1] + in[i - w -4] + in[i - w +4] + in[i + w -4] + in[i + w +4] + in[i + w4 -1] + in[i + w4 +1])
+ kern[4] * (in[i - w4] + in[i - 4] + in[i + 4] + in[i + w4])
+ kern[15+3] * (in[i - w3 -3] + in[i - w3 +3] + in[i + w3 -3] + in[i + w3 +3])
+ kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+ kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w -3] + in[i - w +3] + in[i + w -3] + in[i + w +3] + in[i + w3 -1] + in[i + w3 +1])
+ kern[ 3] * (in[i - w3] + in[i - 3] + in[i + 3] + in[i + w3])
+ kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+ kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
else if(dim == 3 && col >= 3 && row >= 3 && col < w - 3 && row < h - 3)
{
val = kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+ kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w -3] + in[i - w +3] + in[i + w -3] + in[i + w +3] + in[i + w3 -1] + in[i + w3 +1])
+ kern[ 3] * (in[i - w3] + in[i - 3] + in[i + 3] + in[i + w3])
+ kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+ kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
else if(dim == 2 && col >= 2 && row >= 2 && col < w - 2 && row < h - 2)
{
val = kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
else
{
float2 sum = 0.0f;
// the weight sum is a scalar: both channels share the same coefficients
float div = 0.0f;
for(int ir = -dim; ir <= dim; ir++)
{
const int irow = row+ir;
if(irow >= 0 && irow < h)
{
for(int ic = -dim; ic <= dim; ic++)
{
const int icol = col+ic;
if(icol >=0 && icol < w)
{
const float coeff = kern[5 * abs(ir) + abs(ic)];
div += coeff;
sum += coeff * in[mad24(irow, w, icol)];
}
}
}
}
val = (div != 0.0f) ? sum / div : 0.0f;
}
out[i] = clamp(val, minval.xy, maxval.xy);
}

// Same scheme as the ch == 1 branch, operating on float4 pixels.
else if(ch == 4)
{
global float4 *in = (global float4 *)input;
global float4 *out = (global float4 *)output;
float4 val = 0.0f;
if(dim == 4 && col >= 4 && row >= 4 && col < w - 4 && row < h - 4)
{
val = kern[10+4] * (in[i - w4 -2] + in[i - w4 +2] + in[i - w2 -4] + in[i - w2 +4] + in[i + w2 -4] + in[i + w2 +4] + in[i + w4 -2] + in[i + w4 +2])
+ kern[5 +4] * (in[i - w4 -1] + in[i - w4 +1] + in[i - w -4] + in[i - w +4] + in[i + w -4] + in[i + w +4] + in[i + w4 -1] + in[i + w4 +1])
+ kern[4] * (in[i - w4] + in[i - 4] + in[i + 4] + in[i + w4])
+ kern[15+3] * (in[i - w3 -3] + in[i - w3 +3] + in[i + w3 -3] + in[i + w3 +3])
+ kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+ kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w -3] + in[i - w +3] + in[i + w -3] + in[i + w +3] + in[i + w3 -1] + in[i + w3 +1])
+ kern[ 3] * (in[i - w3] + in[i - 3] + in[i + 3] + in[i + w3])
+ kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+ kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
else if(dim == 3 && col >= 3 && row >= 3 && col < w - 3 && row < h - 3)
{
val = kern[10+3] * (in[i - w3 -2] + in[i - w3 +2] + in[i - w2 -3] + in[i - w2 +3] + in[i + w2 -3] + in[i + w2 +3] + in[i + w3 -2] + in[i + w3 +2])
+ kern[ 5+3] * (in[i - w3 -1] + in[i - w3 +1] + in[i - w -3] + in[i - w +3] + in[i + w -3] + in[i + w +3] + in[i + w3 -1] + in[i + w3 +1])
+ kern[ 3] * (in[i - w3] + in[i - 3] + in[i + 3] + in[i + w3])
+ kern[10+2] * (in[i - w2 -2] + in[i - w2 +2] + in[i + w2 -2] + in[i + w2 +2])
+ kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
else if(dim == 2 && col >= 2 && row >= 2 && col < w - 2 && row < h - 2)
{
val = kern[ 5+2] * (in[i - w2 -1] + in[i - w2 +1] + in[i - w -2] + in[i - w +2] + in[i + w -2] + in[i + w +2] + in[i + w2 -1] + in[i + w2 +1])
+ kern[ 2] * (in[i - w2] + in[i - 2] + in[i + 2] + in[i + w2])
+ kern[ 5+1] * (in[i - w -1] + in[i - w +1] + in[i + w -1] + in[i + w +1])
+ kern[ 1] * (in[i - w] + in[i - 1] + in[i + 1] + in[i + w])
+ kern[ 0] * in[i];
}
else
{
float4 sum = 0.0f;
// the weight sum is a scalar: all four channels share the same coefficients
float div = 0.0f;
for(int ir = -dim; ir <= dim; ir++)
{
const int irow = row+ir;
if(irow >= 0 && irow < h)
{
for(int ic = -dim; ic <= dim; ic++)
{
const int icol = col+ic;
if(icol >=0 && icol < w)
{
const float coeff = kern[5 * abs(ir) + abs(ic)];
div += coeff;
sum += coeff * in[mad24(irow, w, icol)];
}
}
}
}
val = (div != 0.0f) ? sum / div : 0.0f;
}
out[i] = clamp(val, minval, maxval);
}
}


float
Expand Down
Loading
Loading