Attempt to optimize bpp=4 and bpp=8 unfilter scenarios.
It seems that there are no improvements from using `std::simd` outside
of bpp=3 and bpp=6... I am guessing that auto-vectorization does a
good-enough job for the other cases.
anforowicz committed Sep 25, 2023
1 parent 788633a commit 36b541b
Showing 1 changed file with 82 additions and 30 deletions.
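For context: the scalar Avg fallback kept behind `#[cfg(not(feature = "unstable"))]` in the second hunk reconstructs each of the eight bytes of a pixel with the Avg rule from the PNG specification. That per-byte step, the operation the commit message expects the compiler to auto-vectorize well enough on its own, is sketched here (the helper name is only for illustration):

    // PNG Avg reconstruction for a single byte:
    //     Recon(x) = Filt(x) + floor((Recon(left) + Recon(above)) / 2)
    // The operands are widened to u16 because `left + above` can reach 510.
    fn unfilter_avg_byte(filtered: u8, left: u8, above: u8) -> u8 {
        filtered.wrapping_add(((left as u16 + above as u16) / 2) as u8)
    }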
112 changes: 82 additions & 30 deletions src/filter.rs
@@ -217,6 +217,40 @@ mod simd {
unfilter3::<PaethState<4>>(prev_row, curr_row);
}

fn unfilter4<T: UnfilterState<4>>(prev_row: &[u8], curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
debug_assert_eq!(prev_row.len() % 4, 0);

let mut state = T::default();
for (prev, curr) in prev_row.chunks_exact(4).zip(curr_row.chunks_exact_mut(4)) {
let prev_simd = u8x4::from_slice(prev);
let mut curr_simd = u8x4::from_slice(curr);
state.step(prev_simd, &mut curr_simd);
curr.copy_from_slice(&curr_simd.to_array());
}
}

pub fn unfilter_paeth4(prev_row: &[u8], curr_row: &mut [u8]) {
unfilter4::<PaethState<4>>(prev_row, curr_row);
}

fn unfilter8<T: UnfilterState<8>>(prev_row: &[u8], curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
debug_assert_eq!(prev_row.len() % 8, 0);

let mut state = T::default();
for (prev, curr) in prev_row.chunks_exact(8).zip(curr_row.chunks_exact_mut(8)) {
let prev_simd = u8x8::from_slice(prev);
let mut curr_simd = u8x8::from_slice(curr);
state.step(prev_simd, &mut curr_simd);
curr.copy_from_slice(&curr_simd.to_array());
}
}

pub fn unfilter_avg8(prev_row: &[u8], curr_row: &mut [u8]) {
unfilter8::<AvgState<8>>(prev_row, curr_row);
}
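The `UnfilterState` trait and the `AvgState`/`PaethState` types used above are defined earlier in `src/filter.rs` and are not shown in this hunk. Inferring only from the calls above (`T::default()` and `state.step(prev_simd, &mut curr_simd)`), a minimal sketch of the trait plus an Avg implementation might look like the following; the trait shape, field name, and method body are assumptions made for illustration, not the crate's actual definitions:

    // Requires nightly Rust with #![feature(portable_simd)], like the rest of
    // this `simd` module.
    use std::simd::{LaneCount, Simd, SupportedLaneCount};

    // Assumed shape of the helper trait: one filter-specific piece of state,
    // advanced once per pixel-sized SIMD chunk.
    trait UnfilterState<const N: usize>: Default
    where
        LaneCount<N>: SupportedLaneCount,
    {
        // `prev` is the pixel above; `curr` is the filtered pixel, reconstructed in place.
        fn step(&mut self, prev: Simd<u8, N>, curr: &mut Simd<u8, N>);
    }

    // Hypothetical Avg state: remembers the previously reconstructed pixel
    // (the pixel to the left), which is all zeros at the start of a row.
    struct AvgState<const N: usize>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        left: Simd<u8, N>,
    }

    impl<const N: usize> Default for AvgState<N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        fn default() -> Self {
            Self { left: Simd::splat(0) }
        }
    }

    impl<const N: usize> UnfilterState<N> for AvgState<N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        fn step(&mut self, prev: Simd<u8, N>, curr: &mut Simd<u8, N>) {
            // Recon(x) = Filt(x) + floor((Recon(left) + Recon(above)) / 2),
            // widened to u16 so the byte-wise sum cannot overflow.
            let left = self.left.cast::<u16>();
            let above = prev.cast::<u16>();
            let avg = ((left + above) / Simd::splat(2)).cast::<u8>();
            *curr += avg; // std::simd integer addition wraps, matching `wrapping_add`.
            self.left = *curr;
        }
    }

Because `std::simd` is still nightly-only, these code paths are gated behind the crate's `unstable` feature, as the `#[cfg(feature = "unstable")]` guards in the hunks below show.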

fn unfilter6<T: UnfilterState<8>>(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
debug_assert_eq!(prev_row.len() % 6, 0);
@@ -631,20 +665,27 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Eight => {
let mut lprev = [0; 8];
for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) {
let new_chunk = [
chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8),
chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8),
];
*TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
lprev = new_chunk;
#[cfg(feature = "unstable")]
simd::unfilter_avg8(previous, current);

#[cfg(not(feature = "unstable"))]
{
let mut lprev = [0; 8];
for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))
{
let new_chunk = [
chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8),
chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8),
];
*TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
lprev = new_chunk;
}
}
}
},
@@ -710,23 +751,34 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Four => {
let mut a_bpp = [0; 4];
let mut c_bpp = [0; 4];
for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4))
#[cfg(feature = "unstable")]
simd::unfilter_paeth4(previous, current);

#[cfg(not(feature = "unstable"))]
{
let new_chunk = [
chunk[0]
.wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
chunk[1]
.wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
chunk[2]
.wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
chunk[3]
.wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
];
*TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
let mut a_bpp = [0; 4];
let mut c_bpp = [0; 4];
for (chunk, b_bpp) in
current.chunks_exact_mut(4).zip(previous.chunks_exact(4))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_decode(
a_bpp[0], b_bpp[0], c_bpp[0],
)),
chunk[1].wrapping_add(filter_paeth_decode(
a_bpp[1], b_bpp[1], c_bpp[1],
)),
chunk[2].wrapping_add(filter_paeth_decode(
a_bpp[2], b_bpp[2], c_bpp[2],
)),
chunk[3].wrapping_add(filter_paeth_decode(
a_bpp[3], b_bpp[3], c_bpp[3],
)),
];
*TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}
}
BytesPerPixel::Six => {
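`filter_paeth_decode`, used by the scalar Paeth fallback above, is defined elsewhere in `filter.rs` and does not appear in this diff. For reference, the value it produces is the standard Paeth predictor from the PNG specification; a plain scalar rendering looks roughly like this (the crate's actual helper may be structured differently for speed):

    // Paeth predictor from the PNG specification: pick whichever of
    // left (a), above (b), or upper-left (c) is closest to a + b - c.
    fn paeth_predictor(a: u8, b: u8, c: u8) -> u8 {
        let p = a as i16 + b as i16 - c as i16;
        let pa = (p - a as i16).abs();
        let pb = (p - b as i16).abs();
        let pc = (p - c as i16).abs();
        if pa <= pb && pa <= pc {
            a
        } else if pb <= pc {
            b
        } else {
            c
        }
    }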
