rapidfuzz · jnqnfe · Nov 3, 2018 · Nov 3, 2018 · Nov 3, 2018 · Nov 3, 2018
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,19 @@
 This project attempts to adhere to [Semantic Versioning](http://semver.org).
 
 ## [Unreleased]
+Most of the improvements in this release are thanks to [@jnqnfe](https://github.com/jnqnfe).
+
+### Changed
+- Optimisations to metric implementations:
+   - Avoided char counting where unnecessary
+   - Avoided comparing portions of strings twice in Levenshtein variants with
+     equal length but non-identical strings
+   - Avoided repeated char counting with `normalized_levenshtein`
+   - Avoided using floats for counting in Jaro, converting to float at end instead
+- Moved tests out to test directory and reorganised
+- Simplified the Hamming tests
+- Simplified and improved failure output of the Jaro/Jaro-Winkler tests
+- Tidied up documentation
 
 ## [0.8.0] - (2018-08-19)
 ### Added
@@ -12,8 +25,8 @@ This project attempts to adhere to [Semantic Versioning](http://semver.org).
 - Faster Levenshtein implementation (thanks [@wdv4758h](https://github.com/wdv4758h))
 
 ### Removed
-- Remove the "against_vec" functions. They are one-liners now, so they don't
-  seem to add enough value to justify making the API larger. I didn't find
+- Remove the “against_vec” functions. They are one-liners now, so they don’t
+  seem to add enough value to justify making the API larger. I didn’t find
   anybody using them when I skimmed through a GitHub search. If you do use them,
   you can change the calls to something like:
 ```rust

diff --git a/Cargo.toml b/Cargo.toml
@@ -12,8 +12,7 @@ keywords = ["string", "similarity", "Hamming", "Levenshtein", "Jaro"]
 homepage = "https://github.com/dguo/strsim-rs"
 repository = "https://github.com/dguo/strsim-rs"
 documentation = "https://docs.rs/strsim/"
-exclude = ["/.travis.yml", "/appveyor.yml", "/dev"]
+exclude = ["/.travis.yml", "/dev"]
 
 [badges]
 travis-ci = { repository = "dguo/strsim-rs" }
-appveyor = { repository = "dguo/strsim-rs" }
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ fn main() {
 
 ## Contributing
 
-If you don't want to install Rust itself, you can run `$ ./dev` for a
+If you don’t want to install Rust itself, you can run `$ ./dev` for a
 development CLI if you have [Docker] installed.
 
 Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`.

diff --git a/benches/benches.rs b/benches/benches.rs
@@ -1,84 +1,86 @@
+// Copyright 2015 Danny Guo
+//
+// Licensed under the MIT license. You may not copy, modify, or distribute this
+// file except in compliance with said license. You can find a copy of this
+// license either in the LICENSE file, or alternatively at
+// <http://opensource.org/licenses/MIT>.
+
 //! Benchmarks for strsim.
 
 #![feature(test)]
 
 extern crate strsim;
+extern crate test;
+use self::test::Bencher;
 
-mod benches {
-    use super::*;
-
-    extern crate test;
-    use self::test::Bencher;
-
-    #[bench]
-    fn bench_hamming(bencher: &mut Bencher) {
-        let a = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGG";
-        let b = "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGC";
-        bencher.iter(|| {
-            strsim::hamming(&a, &b).unwrap();
-        })
-    }
+#[bench]
+fn bench_hamming(bencher: &mut Bencher) {
+    let a = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGG";
+    let b = "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGC";
+    bencher.iter(|| {
+        strsim::hamming(a, b).unwrap();
+    })
+}
 
-    #[bench]
-    fn bench_jaro(bencher: &mut Bencher) {
-        let a = "Philosopher Friedrich Nietzsche";
-        let b = "Philosopher Jean-Paul Sartre";
-        bencher.iter(|| {
-            strsim::jaro(&a, &b);
-        })
-    }
+#[bench]
+fn bench_jaro(bencher: &mut Bencher) {
+    let a = "Philosopher Friedrich Nietzsche";
+    let b = "Philosopher Jean-Paul Sartre";
+    bencher.iter(|| {
+        strsim::jaro(a, b);
+    })
+}
 
-    #[bench]
-    fn bench_jaro_winkler(bencher: &mut Bencher) {
-        let a = "Philosopher Friedrich Nietzsche";
-        let b = "Philosopher Jean-Paul Sartre";
-        bencher.iter(|| {
-            strsim::jaro_winkler(&a, &b);
-        })
-    }
+#[bench]
+fn bench_jaro_winkler(bencher: &mut Bencher) {
+    let a = "Philosopher Friedrich Nietzsche";
+    let b = "Philosopher Jean-Paul Sartre";
+    bencher.iter(|| {
+        strsim::jaro_winkler(a, b);
+    })
+}
 
-    #[bench]
-    fn bench_levenshtein(bencher: &mut Bencher) {
-        let a = "Philosopher Friedrich Nietzsche";
-        let b = "Philosopher Jean-Paul Sartre";
-        bencher.iter(|| {
-            strsim::levenshtein(&a, &b);
-        })
-    }
+#[bench]
+fn bench_levenshtein(bencher: &mut Bencher) {
+    let a = "Philosopher Friedrich Nietzsche";
+    let b = "Philosopher Jean-Paul Sartre";
+    bencher.iter(|| {
+        strsim::levenshtein(a, b);
+    })
+}
 
-    #[bench]
-    fn bench_normalized_levenshtein(bencher: &mut Bencher) {
-        let a = "Philosopher Friedrich Nietzsche";
-        let b = "Philosopher Jean-Paul Sartre";
-        bencher.iter(|| {
-            strsim::normalized_levenshtein(&a, &b);
-        })
-    }
+#[bench]
+fn bench_normalized_levenshtein(bencher: &mut Bencher) {
+    let a = "Philosopher Friedrich Nietzsche";
+    let b = "Philosopher Jean-Paul Sartre";
+    bencher.iter(|| {
+        strsim::normalized_levenshtein(a, b);
+    })
+}
 
-    #[bench]
-    fn bench_osa_distance(bencher: &mut Bencher) {
-        let a = "Philosopher Friedrich Nietzsche";
-        let b = "Philosopher Jean-Paul Sartre";
-        bencher.iter(|| {
-            strsim::osa_distance(&a, &b);
-        })
-    }
+#[bench]
+fn bench_osa_distance(bencher: &mut Bencher) {
+    let a = "Philosopher Friedrich Nietzsche";
+    let b = "Philosopher Jean-Paul Sartre";
+    bencher.iter(|| {
+        strsim::osa_distance(a, b);
+    })
+}
 
-    #[bench]
-    fn bench_damerau_levenshtein(bencher: &mut Bencher) {
-        let a = "Philosopher Friedrich Nietzsche";
-        let b = "Philosopher Jean-Paul Sartre";
-        bencher.iter(|| {
-            strsim::damerau_levenshtein(&a, &b);
-        })
-    }
+#[bench]
+fn bench_damerau_levenshtein(bencher: &mut Bencher) {
+    let a = "Philosopher Friedrich Nietzsche";
+    let b = "Philosopher Jean-Paul Sartre";
+    bencher.iter(|| {
+        strsim::damerau_levenshtein(a, b);
+    })
+}
 
-    #[bench]
-    fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) {
-        let a = "Philosopher Friedrich Nietzsche";
-        let b = "Philosopher Jean-Paul Sartre";
-        bencher.iter(|| {
-            strsim::normalized_damerau_levenshtein(&a, &b);
-        })
-    }
+#[bench]
+fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) {
+    let a = "Philosopher Friedrich Nietzsche";
+    let b = "Philosopher Jean-Paul Sartre";
+    bencher.iter(|| {
+        strsim::normalized_damerau_levenshtein(a, b);
+    })
 }
diff --git a/src/helpers.rs b/src/helpers.rs
@@ -0,0 +1,75 @@
+// Copyright 2018 Lyndon Brown
+//
+// Licensed under the MIT license. You may not copy, modify, or distribute this
+// file except in compliance with said license. You can find a copy of this
+// license either in the LICENSE file, or alternatively at
+// <http://opensource.org/licenses/MIT>.
+
+/// Checks both strings for a common prefix, splitting them after it.
+///
+/// Returns `(prefix, a-suffix, b-suffix, prefix-char-count)`, counting `char`s.
+/// The unchecked slicing is sound: the split offset ends a run of `char`s
+/// shared by `a` and `b`, so it is a `char` boundary in both strings.
+#[inline(always)]
+pub(crate) fn split_on_common_prefix<'a, 'b>(a: &'a str, b: &'b str)
+    -> (&'a str, &'a str, &'b str, usize)
+{
+    let (i, cc) = get_diverge_indice(a, b);
+    unsafe {
+        (a.get_unchecked(..i), a.get_unchecked(i..), b.get_unchecked(i..), cc)
+    }
+}
+
+/// Finds the byte offset at which the two strings diverge (that is, the byte
+/// length of their common prefix), and returns this along with the count of
+/// `char`s that make up the prefix.
+#[inline(always)]
+pub(crate) fn get_diverge_indice(a: &str, b: &str) -> (usize, usize) {
+    let mut char_count = 0;
+    let indice = a.char_indices()
+                  .zip(b.char_indices())
+                  .take_while(|&((_, a_char), (_, b_char))| a_char == b_char)
+                  .inspect(|_| char_count += 1)
+                  .last()
+                  .map_or(0, |((a_indice, a_char), (_, _))| a_indice + a_char.len_utf8());
+    (indice, char_count)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_split_on_common_prefix() {
+        assert_eq!(("", "", "", 0), split_on_common_prefix("", ""));
+        assert_eq!(("", "a", "", 0), split_on_common_prefix("a", ""));
+        assert_eq!(("", "", "a", 0), split_on_common_prefix("", "a"));
+        assert_eq!(("a", "", "", 1), split_on_common_prefix("a", "a"));
+        assert_eq!(("", "thank", "you", 0), split_on_common_prefix("thank", "you"));
+        assert_eq!(("", "hello world!", "foo bar", 0), split_on_common_prefix("hello world!", "foo bar"));
+        assert_eq!(("hello w", "orld!", "urld?", 7), split_on_common_prefix("hello world!", "hello wurld?"));
+        assert_eq!(("kit", "ten", "es", 3), split_on_common_prefix("kitten", "kites"));
+        assert_eq!(("kitten", "", "", 6), split_on_common_prefix("kitten", "kitten"));
+        assert_eq!(("ki", "香ten", "tten", 2), split_on_common_prefix("ki香ten", "kitten"));
+        assert_eq!(("ki", "tten", "香ten", 2), split_on_common_prefix("kitten", "ki香ten"));
+        assert_eq!(("ki香ten", "", "s", 6), split_on_common_prefix("ki香ten", "ki香tens"));
+        assert_eq!(("ki香", "ten", "zen", 3), split_on_common_prefix("ki香ten", "ki香zen"));
+    }
+
+    #[test]
+    fn test_get_diverge_indice() {
+        assert_eq!((0, 0), get_diverge_indice("", ""));
+        assert_eq!((0, 0), get_diverge_indice("a", ""));
+        assert_eq!((0, 0), get_diverge_indice("", "a"));
+        assert_eq!((1, 1), get_diverge_indice("a", "a"));
+        assert_eq!((0, 0), get_diverge_indice("thank", "you"));
+        assert_eq!((0, 0), get_diverge_indice("hello world!", "foo bar"));
+        assert_eq!((7, 7), get_diverge_indice("hello world!", "hello wurld?"));
+        assert_eq!((3, 3), get_diverge_indice("kitten", "kites"));
+        assert_eq!((6, 6), get_diverge_indice("kitten", "kitten"));
+        assert_eq!((2, 2), get_diverge_indice("ki香ten", "kitten"));
+        assert_eq!((2, 2), get_diverge_indice("kitten", "ki香ten"));
+        assert_eq!((8, 6), get_diverge_indice("ki香ten", "ki香tens"));
+        assert_eq!((5, 3), get_diverge_indice("ki香ten", "ki香zen"));
+    }
+}