Skip to content

Commit

Permalink
simd::unfilter_paethN: Load 4 (or 8) bytes at a time (faster than 3 or 6).

Browse files Browse the repository at this point in the history

This CL loads RGB data using 4-bytes-wide loads (and RRGGBB data using
8-byte-wide loads), because:

* This is faster as measured by the microbenchmarks below
* It doesn't change the behavior - before and after these changes we
  were ignoring the 4th SIMD lane when processing RGB data (after this
  change the 4th SIMD lane will contain data from the next pixel, before
  this change it contained a 0 value)
* This is safe as long as we have more than 4 bytes of remaining input
  data (we have to fall back to a 3-bytes-wide load for the last pixel).

Results of running microbenchmarks on the author's machine:

```
$ bench --bench=unfilter --features=unstable,benchmarks -- --baseline=simd1 Paeth/bpp=[36]
...
unfilter/filter=Paeth/bpp=3
                        time:   [18.755 µs 18.761 µs 18.767 µs]
                        thrpt:  [624.44 MiB/s 624.65 MiB/s 624.83 MiB/s]
                 change:
                        time:   [-16.148% -15.964% -15.751%] (p = 0.00 < 0.05)
                        thrpt:  [+18.696% +18.997% +19.258%]
                        Performance has improved.
...
unfilter/filter=Paeth/bpp=6
                        time:   [18.991 µs 19.000 µs 19.009 µs]
                        thrpt:  [1.2041 GiB/s 1.2047 GiB/s 1.2052 GiB/s]
                 change:
                        time:   [-15.161% -15.074% -14.987%] (p = 0.00 < 0.05)
                        thrpt:  [+17.629% +17.750% +17.871%]
                        Performance has improved.
```
  • Loading branch information
anforowicz committed Sep 27, 2023
1 parent d7798cd commit f5021fb
Showing 1 changed file with 44 additions and 10 deletions.
54 changes: 44 additions & 10 deletions src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,34 @@ mod simd {
}

/// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
pub fn unfilter_paeth3(prev_row: &[u8], curr_row: &mut [u8]) {
pub fn unfilter_paeth3(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
debug_assert_eq!(prev_row.len() % 3, 0);

let mut state = PaethState::<4>::default();
for (prev, curr) in prev_row.chunks_exact(3).zip(curr_row.chunks_exact_mut(3)) {
let b = load3(prev);
let mut x = load3(curr);
while prev_row.len() >= 4 {
// `u8x4` requires working with `[u8;4]`, but we can just load and ignore the first
// byte from the next triple. This optimization technique mimics the algorithm found
// in
// https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
let b = u8x4::from_slice(prev_row);
let mut x = u8x4::from_slice(curr_row);

paeth_step(&mut state, b, &mut x);
store3(x, curr);

// We can speculate that writing 4 bytes might be more efficient (just as with using
// `u8x4::from_slice` above), but we can't use that here, because we can't clobber the
// first byte of the next pixel in the `curr_row`.
store3(x, curr_row);

prev_row = &prev_row[3..];
curr_row = &mut curr_row[3..];
}
// Can't use `u8x4::from_slice` for the last `[u8;3]`.
let b = load3(prev_row);
let mut x = load3(curr_row);
paeth_step(&mut state, b, &mut x);
store3(x, curr_row);
}

fn load6(src: &[u8]) -> u8x8 {
Expand All @@ -118,17 +135,34 @@ mod simd {
}

/// Undoes `FilterType::Paeth` for `BytesPerPixel::Three`.
pub fn unfilter_paeth6(prev_row: &[u8], curr_row: &mut [u8]) {
pub fn unfilter_paeth6(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
debug_assert_eq!(prev_row.len() % 6, 0);

let mut state = PaethState::<8>::default();
for (prev, curr) in prev_row.chunks_exact(6).zip(curr_row.chunks_exact_mut(6)) {
let b = load6(prev);
let mut x = load6(curr);
while prev_row.len() >= 8 {
// `u8x8` requires working with `[u8;8]`, but we can just load and ignore the first two
// bytes from the next pixel. This optimization technique mimics the algorithm found
// in
// https://github.com/glennrp/libpng/blob/f8e5fa92b0e37ab597616f554bee254157998227/intel/filter_sse2_intrinsics.c#L130-L131
let b = u8x8::from_slice(prev_row);
let mut x = u8x8::from_slice(curr_row);

paeth_step(&mut state, b, &mut x);
store6(x, curr);

// We can speculate that writing 8 bytes might be more efficient (just as with using
// `u8x8::from_slice` above), but we can't use that here, because we can't clobber the
// first bytes of the next pixel in the `curr_row`.
store6(x, curr_row);

prev_row = &prev_row[6..];
curr_row = &mut curr_row[6..];
}
// Can't use `u8x8::from_slice` for the last `[u8;6]`.
let b = load6(prev_row);
let mut x = load6(curr_row);
paeth_step(&mut state, b, &mut x);
store6(x, curr_row);
}
}

Expand Down

0 comments on commit f5021fb

Please sign in to comment.