Improve Longest Common Subsequence Implementation (#771)

* ref: refactor implementation - Decompose the `longest_common_subsequence` to use helper functions - Rewrite tests using macro - Name variables and functions descriptively - Add docstrings * chore(docs): explain which LCS is returned * chore: remove redundant comments * chore: explain the different LCS outputs returned * tests: add a test case showing that `longest_common_subsequence` is not symmetric --------- Co-authored-by: vil02 <65706193+vil02@users.noreply.github.com>
TheAlgorithms · Jul 31, 2024 · 3c50b91 · 3c50b91
1 parent 27bfd42
commit 3c50b91
Showing 1 changed file with 94 additions and 51 deletions.
diff --git a/src/dynamic_programming/longest_common_subsequence.rs b/src/dynamic_programming/longest_common_subsequence.rs
@@ -1,73 +1,116 @@
-/// Longest common subsequence via Dynamic Programming
+//! This module implements the Longest Common Subsequence (LCS) algorithm.
+//! The LCS problem is finding the longest subsequence common to two sequences.
+//! It differs from the problem of finding common substrings: unlike substrings, subsequences
+//! are not required to occupy consecutive positions within the original sequences.
+//! This implementation compares the inputs `char` by `char` (Unicode scalar values) rather
+//! than byte by byte, so multi-byte characters are never split.
 
-/// longest_common_subsequence(a, b) returns the longest common subsequence
-/// between the strings a and b.
-pub fn longest_common_subsequence(a: &str, b: &str) -> String {
-    let a: Vec<_> = a.chars().collect();
-    let b: Vec<_> = b.chars().collect();
-    let (na, nb) = (a.len(), b.len());
+/// Computes the longest common subsequence of two input strings.
+///
+/// The longest common subsequence (LCS) of two strings is the longest sequence that can
+/// be derived from both strings by deleting some elements without changing the order of
+/// the remaining elements. An empty string is returned when no character is shared.
+///
+/// ## Note
+/// When several common subsequences share the maximal length, only one of them is
+/// returned, and which one depends on the order of the arguments. The result is read
+/// back from the table of LCS lengths starting at its last cell, and the way ties between
+/// neighbouring cells are resolved there is not symmetric in the two inputs. Swapping the
+/// arguments can therefore yield a different, equally long, LCS.
+///
+/// For example:
+/// `longest_common_subsequence("hello, world!", "world, hello!")` returns `"hello!"`
+/// but
+/// `longest_common_subsequence("world, hello!", "hello, world!")` returns `"world!"`
+///
+/// Both results are valid: each is a common subsequence of length 6, and no longer
+/// common subsequence exists for this pair of inputs.
+pub fn longest_common_subsequence(first_seq: &str, second_seq: &str) -> String {
+    let first_seq_chars = first_seq.chars().collect::<Vec<char>>();
+    let second_seq_chars = second_seq.chars().collect::<Vec<char>>();
 
-    // solutions[i][j] is the length of the longest common subsequence
-    // between a[0..i-1] and b[0..j-1]
-    let mut solutions = vec![vec![0; nb + 1]; na + 1];
+    let lcs_lengths = initialize_lcs_lengths(&first_seq_chars, &second_seq_chars);
+    let lcs_chars = reconstruct_lcs(&first_seq_chars, &second_seq_chars, &lcs_lengths);
 
-    for (i, ci) in a.iter().enumerate() {
-        for (j, cj) in b.iter().enumerate() {
-            // if ci == cj, there is a new common character;
-            // otherwise, take the best of the two solutions
-            // at (i-1,j) and (i,j-1)
-            solutions[i + 1][j + 1] = if ci == cj {
-                solutions[i][j] + 1
+    lcs_chars.into_iter().collect()
+}
+/// Builds the table where entry `[i][j]` is the LCS length of the first `i` / first `j` input chars.
+fn initialize_lcs_lengths(first_seq_chars: &[char], second_seq_chars: &[char]) -> Vec<Vec<usize>> {
+    let first_seq_len = first_seq_chars.len();
+    let second_seq_len = second_seq_chars.len();
+
+    let mut lcs_lengths = vec![vec![0; second_seq_len + 1]; first_seq_len + 1];
+    // Row 0 and column 0 keep their initial 0; the loops below only write indices >= 1.
+    // Populate the table: equal chars extend the diagonal, otherwise keep the larger neighbour.
+    (1..=first_seq_len).for_each(|i| {
+        (1..=second_seq_len).for_each(|j| {
+            lcs_lengths[i][j] = if first_seq_chars[i - 1] == second_seq_chars[j - 1] {
+                lcs_lengths[i - 1][j - 1] + 1
             } else {
-                solutions[i][j + 1].max(solutions[i + 1][j])
-            }
-        }
-    }
+                lcs_lengths[i - 1][j].max(lcs_lengths[i][j - 1])
+            };
+        });
+    });
 
-    // reconstitute the solution string from the lengths
-    let mut result: Vec<char> = Vec::new();
-    let (mut i, mut j) = (na, nb);
+    lcs_lengths
+}
+/// Walks the table back from its last cell and returns the chars of one LCS in original order.
+fn reconstruct_lcs(
+    first_seq_chars: &[char],
+    second_seq_chars: &[char],
+    lcs_lengths: &[Vec<usize>],
+) -> Vec<char> {
+    let mut lcs_chars = Vec::new();
+    let mut i = first_seq_chars.len();
+    let mut j = second_seq_chars.len(); // (i, j) starts at the table's last cell
     while i > 0 && j > 0 {
-        if a[i - 1] == b[j - 1] {
-            result.push(a[i - 1]);
+        if first_seq_chars[i - 1] == second_seq_chars[j - 1] {
+            lcs_chars.push(first_seq_chars[i - 1]);
             i -= 1;
             j -= 1;
-        } else if solutions[i - 1][j] > solutions[i][j - 1] {
+        } else if lcs_lengths[i - 1][j] >= lcs_lengths[i][j - 1] { // ties skip first_seq's char
             i -= 1;
         } else {
             j -= 1;
         }
     }
 
-    result.reverse();
-    result.iter().collect()
+    lcs_chars.reverse(); // matches were collected last-to-first
+    lcs_chars
 }
 
 #[cfg(test)]
 mod tests {
-    use super::longest_common_subsequence;
-
-    #[test]
-    fn test_longest_common_subsequence() {
-        // empty case
-        assert_eq!(&longest_common_subsequence("", ""), "");
-        assert_eq!(&longest_common_subsequence("", "abcd"), "");
-        assert_eq!(&longest_common_subsequence("abcd", ""), "");
+    use super::*;
 
-        // simple cases
-        assert_eq!(&longest_common_subsequence("abcd", "c"), "c");
-        assert_eq!(&longest_common_subsequence("abcd", "d"), "d");
-        assert_eq!(&longest_common_subsequence("abcd", "e"), "");
-        assert_eq!(&longest_common_subsequence("abcdefghi", "acegi"), "acegi");
-
-        // less simple cases
-        assert_eq!(&longest_common_subsequence("abcdgh", "aedfhr"), "adh");
-        assert_eq!(&longest_common_subsequence("aggtab", "gxtxayb"), "gtab");
+    macro_rules! longest_common_subsequence_tests { // one #[test] fn per named case
+        ($($name:ident: $test_case:expr,)*) => {
+            $(
+                #[test]
+                fn $name() {
+                    let (first_seq, second_seq, expected_lcs) = $test_case;
+                    assert_eq!(longest_common_subsequence(&first_seq, &second_seq), expected_lcs);
+                }
+            )*
+        };
+    }
 
-        // unicode
-        assert_eq!(
-            &longest_common_subsequence("你好，世界", "再见世界"),
-            "世界"
-        );
+    longest_common_subsequence_tests! {
+        empty_case: ("", "", ""),
+        one_empty: ("", "abcd", ""),
+        identical_strings: ("abcd", "abcd", "abcd"),
+        completely_different: ("abcd", "efgh", ""),
+        single_character: ("a", "a", "a"),
+        different_length: ("abcd", "abc", "abc"),
+        special_characters: ("$#%&", "#@!%", "#%"),
+        long_strings: ("abcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgh",
+                      "bcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgha",
+                      "bcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgh"),
+        unicode_characters: ("你好，世界", "再见，世界", "，世界"),
+        spaces_and_punctuation_0: ("hello, world!", "world, hello!", "hello!"),
+        spaces_and_punctuation_1: ("world, hello!", "hello, world!", "world!"), // swapped arguments: longest_common_subsequence is not symmetric
+        random_case_1: ("abcdef", "xbcxxxe", "bce"),
+        random_case_2: ("xyz", "abc", ""),
+        random_case_3: ("abracadabra", "avadakedavra", "aaadara"),
     }
 }