Skip to content

Commit

Permalink
Improve Longest Common Subsequence Implementation (#771)
Browse files Browse the repository at this point in the history
* ref: refactor implementation
- Decompose the `longest_common_subsequence` to use helper functions
- Rewrite tests using macro
- Naming variables and functions descriptively
- Add docstrings

* chore(docs): explain which LCS is returned

* chore: remove redundant comments

* chore: explain the different LCS outputs returned

* tests: add a test case showing that `longest_common_subsequence` is not symmetric

---------

Co-authored-by: vil02 <65706193+vil02@users.noreply.github.com>
  • Loading branch information
sozelfist and vil02 authored Jul 31, 2024
1 parent 27bfd42 commit 3c50b91
Showing 1 changed file with 94 additions and 51 deletions.
145 changes: 94 additions & 51 deletions src/dynamic_programming/longest_common_subsequence.rs
Original file line number Diff line number Diff line change
@@ -1,73 +1,116 @@
/// Longest common subsequence via Dynamic Programming
//! This module implements the Longest Common Subsequence (LCS) algorithm.
//! The LCS problem is finding the longest subsequence common to two sequences.
//! It differs from the problem of finding common substrings: unlike substrings, subsequences
//! are not required to occupy consecutive positions within the original sequences.
//! This implementation handles Unicode strings efficiently and correctly, ensuring
//! that multi-byte characters are managed properly.

/// longest_common_subsequence(a, b) returns the longest common subsequence
/// between the strings a and b.
pub fn longest_common_subsequence(a: &str, b: &str) -> String {
let a: Vec<_> = a.chars().collect();
let b: Vec<_> = b.chars().collect();
let (na, nb) = (a.len(), b.len());
/// Computes the longest common subsequence of two input strings.
///
/// The longest common subsequence (LCS) of two strings is the longest sequence that can
/// be derived from both strings by deleting some elements without changing the order of
/// the remaining elements.
///
/// ## Note
/// The function may return different LCSs for the same pair of strings depending on the
/// order of the inputs and the nature of the sequences. This is due to the way the dynamic
/// programming algorithm resolves ties when multiple common subsequences of the same length
/// exist. The order of the input strings can influence the specific path taken through the
/// DP table, resulting in different valid LCS outputs.
///
/// For example:
/// `longest_common_subsequence("hello, world!", "world, hello!")` returns `"hello!"`
/// but
/// `longest_common_subsequence("world, hello!", "hello, world!")` returns `"world!"`
///
/// This difference arises because the dynamic programming table is filled differently based
/// on the input order, leading to different tie-breaking decisions and thus different LCS results.
pub fn longest_common_subsequence(first_seq: &str, second_seq: &str) -> String {
// Work on `char`s rather than bytes so multi-byte characters are compared as whole units.
let first_seq_chars = first_seq.chars().collect::<Vec<char>>();
let second_seq_chars = second_seq.chars().collect::<Vec<char>>();

// solutions[i][j] is the length of the longest common subsequence
// between a[0..i-1] and b[0..j-1]
let mut solutions = vec![vec![0; nb + 1]; na + 1];
// Fill the table of LCS lengths, then trace it back to recover the subsequence itself.
let lcs_lengths = initialize_lcs_lengths(&first_seq_chars, &second_seq_chars);
let lcs_chars = reconstruct_lcs(&first_seq_chars, &second_seq_chars, &lcs_lengths);

for (i, ci) in a.iter().enumerate() {
for (j, cj) in b.iter().enumerate() {
// if ci == cj, there is a new common character;
// otherwise, take the best of the two solutions
// at (i-1,j) and (i,j-1)
solutions[i + 1][j + 1] = if ci == cj {
solutions[i][j] + 1
// Concatenate the recovered characters into the returned `String`.
lcs_chars.into_iter().collect()
}

/// Builds the dynamic-programming table of LCS lengths for two character slices.
///
/// The returned table has `first_seq_chars.len() + 1` rows and
/// `second_seq_chars.len() + 1` columns. Entry `[i][j]` is the LCS length of the first
/// `i` characters of `first_seq_chars` and the first `j` characters of
/// `second_seq_chars`; row 0 and column 0 are never written and stay at zero.
fn initialize_lcs_lengths(first_seq_chars: &[char], second_seq_chars: &[char]) -> Vec<Vec<usize>> {
let first_seq_len = first_seq_chars.len();
let second_seq_len = second_seq_chars.len();

// One extra row and column represent the empty prefix of each sequence.
let mut lcs_lengths = vec![vec![0; second_seq_len + 1]; first_seq_len + 1];

// Populate the LCS lengths table
(1..=first_seq_len).for_each(|i| {
(1..=second_seq_len).for_each(|j| {
// Matching characters extend the LCS of both shorter prefixes by one;
// otherwise the best result of dropping one character from either side is kept.
lcs_lengths[i][j] = if first_seq_chars[i - 1] == second_seq_chars[j - 1] {
lcs_lengths[i - 1][j - 1] + 1
} else {
solutions[i][j + 1].max(solutions[i + 1][j])
}
}
}
lcs_lengths[i - 1][j].max(lcs_lengths[i][j - 1])
};
});
});

// reconstitute the solution string from the lengths
let mut result: Vec<char> = Vec::new();
let (mut i, mut j) = (na, nb);
lcs_lengths
}

/// Recovers one longest common subsequence by walking the LCS lengths table backwards.
///
/// Starting from the cell indexed by the full lengths of both sequences, the walk moves
/// diagonally whenever the current characters match (recording that character) and
/// otherwise steps towards the neighbouring cell with the larger LCS length. It stops as
/// soon as either index reaches zero.
///
/// `lcs_lengths` is indexed as `[i][j]` with `i` up to `first_seq_chars.len()` and `j`
/// up to `second_seq_chars.len()`.
///
/// Returns the characters of the recovered LCS in their original left-to-right order.
fn reconstruct_lcs(
first_seq_chars: &[char],
second_seq_chars: &[char],
lcs_lengths: &[Vec<usize>],
) -> Vec<char> {
let mut lcs_chars = Vec::new();
let mut i = first_seq_chars.len();
let mut j = second_seq_chars.len();
while i > 0 && j > 0 {
if a[i - 1] == b[j - 1] {
result.push(a[i - 1]);
if first_seq_chars[i - 1] == second_seq_chars[j - 1] {
lcs_chars.push(first_seq_chars[i - 1]);
i -= 1;
j -= 1;
} else if solutions[i - 1][j] > solutions[i][j - 1] {
} else if lcs_lengths[i - 1][j] >= lcs_lengths[i][j - 1] {
// Ties (`>=`) are resolved by stepping back in the first sequence; this choice
// decides which of several equally long subsequences is returned.
i -= 1;
} else {
j -= 1;
}
}

result.reverse();
result.iter().collect()
// Characters were collected back-to-front, so restore the original order.
lcs_chars.reverse();
lcs_chars
}

#[cfg(test)]
mod tests {
use super::longest_common_subsequence;

#[test]
fn test_longest_common_subsequence() {
// empty case
assert_eq!(&longest_common_subsequence("", ""), "");
assert_eq!(&longest_common_subsequence("", "abcd"), "");
assert_eq!(&longest_common_subsequence("abcd", ""), "");
use super::*;

// simple cases
assert_eq!(&longest_common_subsequence("abcd", "c"), "c");
assert_eq!(&longest_common_subsequence("abcd", "d"), "d");
assert_eq!(&longest_common_subsequence("abcd", "e"), "");
assert_eq!(&longest_common_subsequence("abcdefghi", "acegi"), "acegi");

// less simple cases
assert_eq!(&longest_common_subsequence("abcdgh", "aedfhr"), "adh");
assert_eq!(&longest_common_subsequence("aggtab", "gxtxayb"), "gtab");
// Generates one `#[test]` function per `name: (first_seq, second_seq, expected_lcs),` entry.
// Each generated test calls `longest_common_subsequence` on the two inputs and compares
// the result with the expected subsequence.
macro_rules! longest_common_subsequence_tests {
    ($($case_name:ident: $inputs:expr,)*) => {
        $(
            #[test]
            fn $case_name() {
                let (first_seq, second_seq, expected_lcs) = $inputs;
                let actual_lcs = longest_common_subsequence(&first_seq, &second_seq);
                assert_eq!(actual_lcs, expected_lcs);
            }
        )*
    };
}

// unicode
assert_eq!(
&longest_common_subsequence("你好,世界", "再见世界"),
"世界"
);
// Table of test cases: each entry is `name: (first_seq, second_seq, expected_lcs)` and is
// expanded by the macro into its own `#[test]` function.
longest_common_subsequence_tests! {
    empty_case: ("", "", ""),
    one_empty: ("", "abcd", ""),
    identical_strings: ("abcd", "abcd", "abcd"),
    completely_different: ("abcd", "efgh", ""),
    single_character: ("a", "a", "a"),
    different_length: ("abcd", "abc", "abc"),
    special_characters: ("$#%&", "#@!%", "#%"),
    long_strings: ("abcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgh",
        "bcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgha",
        "bcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgh"),
    unicode_characters: ("你好,世界", "再见,世界", ",世界"),
    spaces_and_punctuation_0: ("hello, world!", "world, hello!", "hello!"),
    // Same two strings with the arguments swapped: an equally long but different LCS is
    // returned, which shows that `longest_common_subsequence` is not symmetric.
    spaces_and_punctuation_1: ("world, hello!", "hello, world!", "world!"),
    random_case_1: ("abcdef", "xbcxxxe", "bce"),
    random_case_2: ("xyz", "abc", ""),
    random_case_3: ("abracadabra", "avadakedavra", "aaadara"),
}
}

0 comments on commit 3c50b91

Please sign in to comment.