diff --git a/src/filter.rs b/src/filter.rs
index 65490ece..ddb097cf 100644
--- a/src/filter.rs
+++ b/src/filter.rs
@@ -217,6 +217,40 @@ mod simd {
         unfilter3::>(prev_row, curr_row);
     }
 
+    fn unfilter4>(prev_row: &[u8], curr_row: &mut [u8]) {
+        debug_assert_eq!(prev_row.len(), curr_row.len());
+        debug_assert_eq!(prev_row.len() % 4, 0);
+
+        let mut state = T::default();
+        for (prev, curr) in prev_row.chunks_exact(4).zip(curr_row.chunks_exact_mut(4)) {
+            let prev_simd = u8x4::from_slice(prev);
+            let mut curr_simd = u8x4::from_slice(curr);
+            state.step(prev_simd, &mut curr_simd);
+            curr.copy_from_slice(&curr_simd.to_array());
+        }
+    }
+
+    pub fn unfilter_paeth4(prev_row: &[u8], curr_row: &mut [u8]) {
+        unfilter4::>(prev_row, curr_row);
+    }
+
+    fn unfilter8>(prev_row: &[u8], curr_row: &mut [u8]) {
+        debug_assert_eq!(prev_row.len(), curr_row.len());
+        debug_assert_eq!(prev_row.len() % 8, 0);
+
+        let mut state = T::default();
+        for (prev, curr) in prev_row.chunks_exact(8).zip(curr_row.chunks_exact_mut(8)) {
+            let prev_simd = u8x8::from_slice(prev);
+            let mut curr_simd = u8x8::from_slice(curr);
+            state.step(prev_simd, &mut curr_simd);
+            curr.copy_from_slice(&curr_simd.to_array());
+        }
+    }
+
+    pub fn unfilter_avg8(prev_row: &[u8], curr_row: &mut [u8]) {
+        unfilter8::>(prev_row, curr_row);
+    }
+
     fn unfilter6>(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
         debug_assert_eq!(prev_row.len(), curr_row.len());
         debug_assert_eq!(prev_row.len() % 6, 0);
@@ -631,20 +665,27 @@ pub(crate) fn unfilter(
                 }
             }
             BytesPerPixel::Eight => {
-                let mut lprev = [0; 8];
-                for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) {
-                    let new_chunk = [
-                        chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
-                        chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
-                        chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
-                        chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
-                        chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
-                        chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
-                        chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8),
-                        chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8),
-                    ];
-                    *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
-                    lprev = new_chunk;
+                #[cfg(feature = "unstable")]
+                simd::unfilter_avg8(previous, current);
+
+                #[cfg(not(feature = "unstable"))]
+                {
+                    let mut lprev = [0; 8];
+                    for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))
+                    {
+                        let new_chunk = [
+                            chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
+                            chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
+                            chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
+                            chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
+                            chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
+                            chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
+                            chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8),
+                            chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8),
+                        ];
+                        *TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
+                        lprev = new_chunk;
+                    }
                 }
             }
         },
@@ -710,23 +751,34 @@ pub(crate) fn unfilter(
                 }
             }
             BytesPerPixel::Four => {
-                let mut a_bpp = [0; 4];
-                let mut c_bpp = [0; 4];
-                for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4))
+                #[cfg(feature = "unstable")]
+                simd::unfilter_paeth4(previous, current);
+
+                #[cfg(not(feature = "unstable"))]
                 {
-                    let new_chunk = [
-                        chunk[0]
-                            .wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
-                        chunk[1]
-                            .wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
-                        chunk[2]
-                            .wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
-                        chunk[3]
-                            .wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
-                    ];
-                    *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
-                    a_bpp = new_chunk;
-                    c_bpp = b_bpp.try_into().unwrap();
+                    let mut a_bpp = [0; 4];
+                    let mut c_bpp = [0; 4];
+                    for (chunk, b_bpp) in
+                        current.chunks_exact_mut(4).zip(previous.chunks_exact(4))
+                    {
+                        let new_chunk = [
+                            chunk[0].wrapping_add(filter_paeth_decode(
+                                a_bpp[0], b_bpp[0], c_bpp[0],
+                            )),
+                            chunk[1].wrapping_add(filter_paeth_decode(
+                                a_bpp[1], b_bpp[1], c_bpp[1],
+                            )),
+                            chunk[2].wrapping_add(filter_paeth_decode(
+                                a_bpp[2], b_bpp[2], c_bpp[2],
+                            )),
+                            chunk[3].wrapping_add(filter_paeth_decode(
+                                a_bpp[3], b_bpp[3], c_bpp[3],
+                            )),
+                        ];
+                        *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
+                        a_bpp = new_chunk;
+                        c_bpp = b_bpp.try_into().unwrap();
+                    }
                 }
             }
             BytesPerPixel::Six => {
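For context: the generic `unfilter4`/`unfilter8` helpers above only drive a per-pixel `state.step(...)`, and the state types they are instantiated with live outside these hunks. The sketch below is a hypothetical example of what such a state could look like for the Average filter on 8-byte pixels (one pixel per SIMD vector, lanes = channels), written against nightly `std::simd`, which is what the crate's `unstable` feature gates. The `AvgState` name and the `step` signature are assumptions for illustration, not the code this patch adds.

// Hypothetical sketch only; requires nightly Rust with the portable_simd feature.
#![feature(portable_simd)]
use std::simd::u8x8;

#[derive(Default)]
struct AvgState {
    // Previous (already unfiltered) pixel in the current row; starts at zero.
    left: u8x8,
}

impl AvgState {
    // `above` is the pixel directly above; `curr` holds the filtered bytes and is
    // overwritten with the reconstructed pixel, mirroring the scalar fallback loop.
    fn step(&mut self, above: u8x8, curr: &mut u8x8) {
        let one = u8x8::splat(1);
        // Overflow-free floor((above + left) / 2), computed lane by lane.
        let avg = (above >> one) + (self.left >> one) + (above & self.left & one);
        *curr += avg; // Simd addition wraps, matching `wrapping_add` in the scalar code.
        self.left = *curr;
    }
}

fn main() {
    // One 8-byte pixel (e.g. 16-bit RGBA): filtered byte 10, byte above 20, no left neighbour yet.
    let mut state = AvgState::default();
    let mut curr = u8x8::splat(10);
    state.step(u8x8::splat(20), &mut curr);
    assert_eq!(curr.to_array(), [20; 8]); // 10 + floor((20 + 0) / 2)
}

Keeping the running `left` pixel inside the state is what lets the generic helpers stream a whole row through `chunks_exact` without re-reading the output slice, the same role `lprev`/`a_bpp` play in the scalar branches.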