Attempt to optimize bpp=4 and bpp=8 unfilter scenarios.
It seems that there are no improvements from using `std::simd` outside
of bpp=3 and bpp=6... I am guessing that auto-vectorization does a
good-enough job for the other cases.
anforowicz committed Sep 25, 2023
1 parent 788633a commit 36b541b
Showing 1 changed file with 82 additions and 30 deletions.
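For context: the scalar Avg fallback kept behind `#[cfg(not(feature = "unstable"))]` in the second hunk reconstructs each of the eight bytes of a pixel with the Avg rule from the PNG specification. That per-byte step, the operation the commit message expects the compiler to auto-vectorize well enough on its own, is sketched here (the helper name is only for illustration):

    // PNG Avg reconstruction for a single byte:
    //     Recon(x) = Filt(x) + floor((Recon(left) + Recon(above)) / 2)
    // The operands are widened to u16 because `left + above` can reach 510.
    fn unfilter_avg_byte(filtered: u8, left: u8, above: u8) -> u8 {
        filtered.wrapping_add(((left as u16 + above as u16) / 2) as u8)
    }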
112 changes: 82 additions & 30 deletions src/filter.rs
@@ -217,6 +217,40 @@ mod simd {
unfilter3::<PaethState<4>>(prev_row, curr_row);
}

fn unfilter4<T: UnfilterState<4>>(prev_row: &[u8], curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
debug_assert_eq!(prev_row.len() % 4, 0);

let mut state = T::default();
for (prev, curr) in prev_row.chunks_exact(4).zip(curr_row.chunks_exact_mut(4)) {
let prev_simd = u8x4::from_slice(prev);
let mut curr_simd = u8x4::from_slice(curr);
state.step(prev_simd, &mut curr_simd);
curr.copy_from_slice(&curr_simd.to_array());
}
}

pub fn unfilter_paeth4(prev_row: &[u8], curr_row: &mut [u8]) {
unfilter4::<PaethState<4>>(prev_row, curr_row);
}

fn unfilter8<T: UnfilterState<8>>(prev_row: &[u8], curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
debug_assert_eq!(prev_row.len() % 8, 0);

let mut state = T::default();
for (prev, curr) in prev_row.chunks_exact(8).zip(curr_row.chunks_exact_mut(8)) {
let prev_simd = u8x8::from_slice(prev);
let mut curr_simd = u8x8::from_slice(curr);
state.step(prev_simd, &mut curr_simd);
curr.copy_from_slice(&curr_simd.to_array());
}
}

pub fn unfilter_avg8(prev_row: &[u8], curr_row: &mut [u8]) {
unfilter8::<AvgState<8>>(prev_row, curr_row);
}
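The `UnfilterState` trait and the `AvgState`/`PaethState` types used above are defined earlier in `src/filter.rs` and are not shown in this hunk. Inferring only from the calls above (`T::default()` and `state.step(prev_simd, &mut curr_simd)`), a minimal sketch of the trait plus an Avg implementation might look like the following; the trait shape, field name, and method body are assumptions made for illustration, not the crate's actual definitions:

    // Requires nightly Rust with #![feature(portable_simd)], like the rest of
    // this `simd` module.
    use std::simd::{LaneCount, Simd, SupportedLaneCount};

    // Assumed shape of the helper trait: one filter-specific piece of state,
    // advanced once per pixel-sized SIMD chunk.
    trait UnfilterState<const N: usize>: Default
    where
        LaneCount<N>: SupportedLaneCount,
    {
        // `prev` is the pixel above; `curr` is the filtered pixel, reconstructed in place.
        fn step(&mut self, prev: Simd<u8, N>, curr: &mut Simd<u8, N>);
    }

    // Hypothetical Avg state: remembers the previously reconstructed pixel
    // (the pixel to the left), which is all zeros at the start of a row.
    struct AvgState<const N: usize>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        left: Simd<u8, N>,
    }

    impl<const N: usize> Default for AvgState<N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        fn default() -> Self {
            Self { left: Simd::splat(0) }
        }
    }

    impl<const N: usize> UnfilterState<N> for AvgState<N>
    where
        LaneCount<N>: SupportedLaneCount,
    {
        fn step(&mut self, prev: Simd<u8, N>, curr: &mut Simd<u8, N>) {
            // Recon(x) = Filt(x) + floor((Recon(left) + Recon(above)) / 2),
            // widened to u16 so the byte-wise sum cannot overflow.
            let left = self.left.cast::<u16>();
            let above = prev.cast::<u16>();
            let avg = ((left + above) / Simd::splat(2)).cast::<u8>();
            *curr += avg; // std::simd integer addition wraps, matching `wrapping_add`.
            self.left = *curr;
        }
    }

Because `std::simd` is still nightly-only, these code paths are gated behind the crate's `unstable` feature, as the `#[cfg(feature = "unstable")]` guards in the hunks below show.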

fn unfilter6<T: UnfilterState<8>>(mut prev_row: &[u8], mut curr_row: &mut [u8]) {
debug_assert_eq!(prev_row.len(), curr_row.len());
debug_assert_eq!(prev_row.len() % 6, 0);
@@ -631,20 +665,27 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Eight => {
let mut lprev = [0; 8];
for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8)) {
let new_chunk = [
chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8),
chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8),
];
*TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
lprev = new_chunk;
#[cfg(feature = "unstable")]
simd::unfilter_avg8(previous, current);

#[cfg(not(feature = "unstable"))]
{
let mut lprev = [0; 8];
for (chunk, above) in current.chunks_exact_mut(8).zip(previous.chunks_exact(8))
{
let new_chunk = [
chunk[0].wrapping_add(((above[0] as u16 + lprev[0] as u16) / 2) as u8),
chunk[1].wrapping_add(((above[1] as u16 + lprev[1] as u16) / 2) as u8),
chunk[2].wrapping_add(((above[2] as u16 + lprev[2] as u16) / 2) as u8),
chunk[3].wrapping_add(((above[3] as u16 + lprev[3] as u16) / 2) as u8),
chunk[4].wrapping_add(((above[4] as u16 + lprev[4] as u16) / 2) as u8),
chunk[5].wrapping_add(((above[5] as u16 + lprev[5] as u16) / 2) as u8),
chunk[6].wrapping_add(((above[6] as u16 + lprev[6] as u16) / 2) as u8),
chunk[7].wrapping_add(((above[7] as u16 + lprev[7] as u16) / 2) as u8),
];
*TryInto::<&mut [u8; 8]>::try_into(chunk).unwrap() = new_chunk;
lprev = new_chunk;
}
}
}
},
@@ -710,23 +751,34 @@ pub(crate) fn unfilter(
}
}
BytesPerPixel::Four => {
let mut a_bpp = [0; 4];
let mut c_bpp = [0; 4];
for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4))
#[cfg(feature = "unstable")]
simd::unfilter_paeth4(previous, current);

#[cfg(not(feature = "unstable"))]
{
let new_chunk = [
chunk[0]
.wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
chunk[1]
.wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
chunk[2]
.wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
chunk[3]
.wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
];
*TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
let mut a_bpp = [0; 4];
let mut c_bpp = [0; 4];
for (chunk, b_bpp) in
current.chunks_exact_mut(4).zip(previous.chunks_exact(4))
{
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_decode(
a_bpp[0], b_bpp[0], c_bpp[0],
)),
chunk[1].wrapping_add(filter_paeth_decode(
a_bpp[1], b_bpp[1], c_bpp[1],
)),
chunk[2].wrapping_add(filter_paeth_decode(
a_bpp[2], b_bpp[2], c_bpp[2],
)),
chunk[3].wrapping_add(filter_paeth_decode(
a_bpp[3], b_bpp[3], c_bpp[3],
)),
];
*TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
a_bpp = new_chunk;
c_bpp = b_bpp.try_into().unwrap();
}
}
}
BytesPerPixel::Six => {
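`filter_paeth_decode`, used by the scalar Paeth fallback above, is defined elsewhere in `filter.rs` and does not appear in this diff. For reference, the value it produces is the standard Paeth predictor from the PNG specification; a plain scalar rendering looks roughly like this (the crate's actual helper may be structured differently for speed):

    // Paeth predictor from the PNG specification: pick whichever of
    // left (a), above (b), or upper-left (c) is closest to a + b - c.
    fn paeth_predictor(a: u8, b: u8, c: u8) -> u8 {
        let p = a as i16 + b as i16 - c as i16;
        let pa = (p - a as i16).abs();
        let pb = (p - b as i16).abs();
        let pc = (p - c as i16).abs();
        if pa <= pb && pa <= pc {
            a
        } else if pb <= pc {
            b
        } else {
            c
        }
    }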
