-
-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Improve Longest Common Subsequence Implementation (#771)
* ref: refactor implementation - Decompose the `longest_common_subsequence` to use helper functions - Rewrite tests using macro - Naming variables and functions descriptively - Add docstrings * chore(docs): explain which LCS is returned * chore: remove redundant comments * chore: explain the different LCS outputs returned * tests: add a test case showing that `longest_common_subsequence` is not symmetric --------- Co-authored-by: vil02 <65706193+vil02@users.noreply.github.com>
- Loading branch information
Showing
1 changed file
with
94 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,73 +1,116 @@ | ||
/// Longest common subsequence via Dynamic Programming | ||
//! This module implements the Longest Common Subsequence (LCS) algorithm. | ||
//! The LCS problem is finding the longest subsequence common to two sequences. | ||
//! It differs from the problem of finding common substrings: unlike substrings, subsequences | ||
//! are not required to occupy consecutive positions within the original sequences. | ||
//! This implementation handles Unicode strings efficiently and correctly, ensuring | ||
//! that multi-byte characters are managed properly. | ||
|
||
/// longest_common_subsequence(a, b) returns the longest common subsequence | ||
/// between the strings a and b. | ||
pub fn longest_common_subsequence(a: &str, b: &str) -> String { | ||
let a: Vec<_> = a.chars().collect(); | ||
let b: Vec<_> = b.chars().collect(); | ||
let (na, nb) = (a.len(), b.len()); | ||
/// Computes the longest common subsequence of two input strings. | ||
/// | ||
/// The longest common subsequence (LCS) of two strings is the longest sequence that can | ||
/// be derived from both strings by deleting some elements without changing the order of | ||
/// the remaining elements. | ||
/// | ||
/// ## Note | ||
/// The function may return different LCSs for the same pair of strings depending on the | ||
/// order of the inputs and the nature of the sequences. This is due to the way the dynamic | ||
/// programming algorithm resolves ties when multiple common subsequences of the same length | ||
/// exist. The order of the input strings can influence the specific path taken through the | ||
/// DP table, resulting in different valid LCS outputs. | ||
/// | ||
/// For example: | ||
/// `longest_common_subsequence("hello, world!", "world, hello!")` returns `"hello!"` | ||
/// but | ||
/// `longest_common_subsequence("world, hello!", "hello, world!")` returns `"world!"` | ||
/// | ||
/// This difference arises because the dynamic programming table is filled differently based | ||
/// on the input order, leading to different tie-breaking decisions and thus different LCS results. | ||
pub fn longest_common_subsequence(first_seq: &str, second_seq: &str) -> String { | ||
let first_seq_chars = first_seq.chars().collect::<Vec<char>>(); | ||
let second_seq_chars = second_seq.chars().collect::<Vec<char>>(); | ||
|
||
// solutions[i][j] is the length of the longest common subsequence | ||
// between a[0..i-1] and b[0..j-1] | ||
let mut solutions = vec![vec![0; nb + 1]; na + 1]; | ||
let lcs_lengths = initialize_lcs_lengths(&first_seq_chars, &second_seq_chars); | ||
let lcs_chars = reconstruct_lcs(&first_seq_chars, &second_seq_chars, &lcs_lengths); | ||
|
||
for (i, ci) in a.iter().enumerate() { | ||
for (j, cj) in b.iter().enumerate() { | ||
// if ci == cj, there is a new common character; | ||
// otherwise, take the best of the two solutions | ||
// at (i-1,j) and (i,j-1) | ||
solutions[i + 1][j + 1] = if ci == cj { | ||
solutions[i][j] + 1 | ||
lcs_chars.into_iter().collect() | ||
} | ||
|
||
/// Builds the table of LCS lengths for every pair of prefixes of the two sequences.
///
/// The returned table has `first_seq_chars.len() + 1` rows and
/// `second_seq_chars.len() + 1` columns. Entry `[i][j]` is the length of the longest
/// common subsequence of the first `i` characters of `first_seq_chars` and the first `j`
/// characters of `second_seq_chars`; row `0` and column `0` (empty prefixes) stay `0`.
fn initialize_lcs_lengths(first_seq_chars: &[char], second_seq_chars: &[char]) -> Vec<Vec<usize>> {
    let mut lcs_lengths = vec![vec![0; second_seq_chars.len() + 1]; first_seq_chars.len() + 1];

    for (i, first_char) in first_seq_chars.iter().enumerate() {
        for (j, second_char) in second_seq_chars.iter().enumerate() {
            // Table indices are shifted by one because index 0 denotes the empty prefix.
            lcs_lengths[i + 1][j + 1] = if first_char == second_char {
                // A matching pair extends the LCS of both shorter prefixes.
                lcs_lengths[i][j] + 1
            } else {
                // Otherwise keep the better result obtained by dropping one character.
                lcs_lengths[i][j + 1].max(lcs_lengths[i + 1][j])
            };
        }
    }

    lcs_lengths
}
|
||
/// Recovers one longest common subsequence by walking the table of LCS lengths backwards.
///
/// `lcs_lengths[i][j]` is expected to hold the LCS length of the first `i` characters of
/// `first_seq_chars` and the first `j` characters of `second_seq_chars`, i.e. the table
/// must have at least `first_seq_chars.len() + 1` rows and `second_seq_chars.len() + 1`
/// columns whenever both sequences are non-empty.
///
/// The walk starts at the bottom-right cell and stops as soon as either prefix is empty.
/// The characters are returned in their original (left-to-right) order.
fn reconstruct_lcs(
    first_seq_chars: &[char],
    second_seq_chars: &[char],
    lcs_lengths: &[Vec<usize>],
) -> Vec<char> {
    let mut lcs_chars = Vec::new();
    let mut i = first_seq_chars.len();
    let mut j = second_seq_chars.len();

    while i > 0 && j > 0 {
        if first_seq_chars[i - 1] == second_seq_chars[j - 1] {
            // Matching characters belong to the LCS; step diagonally.
            lcs_chars.push(first_seq_chars[i - 1]);
            i -= 1;
            j -= 1;
        } else if lcs_lengths[i - 1][j] >= lcs_lengths[i][j - 1] {
            // On a tie, prefer shortening the first sequence. This tie-break decides
            // which of several equally long subsequences is returned.
            i -= 1;
        } else {
            j -= 1;
        }
    }

    // Characters were collected from the end towards the start.
    lcs_chars.reverse();
    lcs_chars
}
|
||
#[cfg(test)]
mod tests {
    use super::*;

    // Generates one `#[test]` function per `name: (first, second, expected)` entry, so
    // every case is reported (and can fail) independently.
    macro_rules! longest_common_subsequence_tests {
        ($($name:ident: $test_case:expr,)*) => {
            $(
                #[test]
                fn $name() {
                    let (first_seq, second_seq, expected_lcs) = $test_case;
                    assert_eq!(longest_common_subsequence(first_seq, second_seq), expected_lcs);
                }
            )*
        };
    }

    longest_common_subsequence_tests! {
        empty_case: ("", "", ""),
        one_empty: ("", "abcd", ""),
        other_empty: ("abcd", "", ""),
        identical_strings: ("abcd", "abcd", "abcd"),
        completely_different: ("abcd", "efgh", ""),
        single_character: ("a", "a", "a"),
        different_length: ("abcd", "abc", "abc"),
        special_characters: ("$#%&", "#@!%", "#%"),
        long_strings: ("abcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgh",
                       "bcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgha",
                       "bcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefghabcdefgh"),
        unicode_characters: ("你好,世界", "再见,世界", ",世界"),
        spaces_and_punctuation_0: ("hello, world!", "world, hello!", "hello!"),
        // Same pair with the arguments swapped: longest_common_subsequence is not symmetric.
        spaces_and_punctuation_1: ("world, hello!", "hello, world!", "world!"),
        random_case_1: ("abcdef", "xbcxxxe", "bce"),
        random_case_2: ("xyz", "abc", ""),
        random_case_3: ("abracadabra", "avadakedavra", "aaadara"),
    }
}