Skip to content

Commit

Permalink
Merge pull request #82 from bacpop/i81_distances_ambig
Browse files Browse the repository at this point in the history
Better default filtering options for ska distance
  • Loading branch information
johnlees committed Sep 25, 2024
2 parents b9c8909 + cc496f8 commit 55667e8
Show file tree
Hide file tree
Showing 10 changed files with 44 additions and 23 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "ska"
version = "0.3.10"
version = "0.3.11"
authors = [
"John Lees <jlees@ebi.ac.uk>",
"Simon Harris <simon.harris@gatesfoundation.org>",
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,18 @@ Split k-mer analysis (version 2) uses exact matching of split k-mer sequences to

SKA can only align SNPs further than the k-mer length apart, and does not use a gap penalty approach or give alignment scores. But the advantages are speed and flexibility, particularly the ability to run on a reference-free manner (i.e. including accessory genome variation) on both assemblies and reads.

### Citation

Romain Derelle, Johanna von Wachsmann, Tommi M&auml;klin, Joel Hellewell, Timothy Russell, Ajit Lalvani, Leonid Chindelevitch, Nicholas J. Croucher, Simon R. Harris, John A. Lees (2024).
**Seamless, rapid and accurate analyses of outbreak genomic data using Split _k_-mer Analysis**
*Genome Research*

## Documentation

Can be found at https://docs.rs/ska. We also have some tutorials available:

- [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/).
- [Filtering options](https://www.bacpop.org/guides/snp_alignment_with_ska/).

## Installation

Expand Down
10 changes: 6 additions & 4 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ pub const DEFAULT_KMER: usize = 17;
pub const DEFAULT_PROPORTION_READS: Option<f64> = None;
/// Default single strand (which is equivalent to !rc)
pub const DEFAULT_STRAND: bool = false;
/// Default minimum frequency filter threshold
pub const DEFAULT_MINFREQ: f64 = 0.9;
/// Default behaviour when min-freq counting ambig sites
pub const DEFAULT_AMBIGMISSING: bool = false;
/// Default repeat masking behaviour
Expand Down Expand Up @@ -191,7 +193,7 @@ pub enum Commands {
output: Option<String>,

/// Minimum fraction of samples a k-mer has to appear in
#[arg(short, long, value_parser = zero_to_one, default_value_t = 0.9)]
#[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)]
min_freq: f64,

/// With min_freq, only count non-ambiguous sites
Expand Down Expand Up @@ -255,9 +257,9 @@ pub enum Commands {
#[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)]
min_freq: f64,

/// Filter for ambiguous bases
/// Filter out ambiguous bases ('N' still a mismatch)
#[arg(long, default_value_t = false)]
filter_ambiguous: bool,
allow_ambiguous: bool,

/// Number of CPU threads
#[arg(long, value_parser = valid_cpus, default_value_t = 1)]
Expand Down Expand Up @@ -312,7 +314,7 @@ pub enum Commands {
reverse: bool,

/// Minimum fraction of samples a k-mer has to appear in
#[arg(short, long, value_parser = zero_to_one, default_value_t = 0.0)]
#[arg(short, long, value_parser = zero_to_one, default_value_t = DEFAULT_MINFREQ)]
min_freq: f64,

/// With min_freq, only count non-ambiguous sites
Expand Down
2 changes: 2 additions & 0 deletions src/generic_modes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ pub fn distance<IntT: for<'a> UInt<'a>>(
);
if filt_ambig || (min_freq * ska_array.nsamples() as f64 >= 1.0) {
if filt_ambig {
let filter_ambig_as_missing = true;
let mask_ambig = true;
apply_filters(
ska_array,
min_freq,
Expand Down
17 changes: 11 additions & 6 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
//!
//! Tutorials:
//! - [From genomes to trees](https://www.bacpop.org/guides/building_trees_with_ska/).
//! - [Filtering options](https://www.bacpop.org/guides/snp_alignment_with_ska/).
//!
//! Command line usage follows. For API documentation and usage, see the [end of this section](#api-usage).
//!
Expand Down Expand Up @@ -208,9 +209,12 @@
//! ska distance -o distances.txt seqs.skf
//! ```
//!
//! Ignore ambiguous bases by adding `--filter-ambiguous` flag, and `--min-freq` to
//! ignore k-mers only found in some samples. Multiple threads
//! can be used, but this will only be effective with large numbers of samples.
//! Consider ambiguous bases by adding `--allow-ambiguous` flag, and change `--min-freq` to
//! ignore more/less k-mers only found in some samples (default = 0.9). Note that ambiguous bases may overestimate
//! distances due to repeat k-mers. For finer control over filtering, first run `ska weed`
//! on the input .skf.
//!
//! Multiple threads can be used, but this will only be effective with large numbers of samples.
//!
//! The companion script in `scripts/cluster_dists.py` (requires `networkx`) can
//! be used to make single linkage clusters from these distances at given thresholds,
Expand Down Expand Up @@ -619,18 +623,19 @@ pub fn main() {
skf_file,
output,
min_freq,
filter_ambiguous,
allow_ambiguous,
threads,
} => {
check_threads(*threads);
let filter_ambiguous = !*allow_ambiguous;
if let Ok(mut ska_array) = MergeSkaArray::<u64>::load(skf_file) {
// In debug mode (cannot be set from CLI, give details)
log::debug!("{ska_array}");
distance(
&mut ska_array,
output,
*min_freq,
*filter_ambiguous,
filter_ambiguous,
*threads,
);
} else if let Ok(mut ska_array) = MergeSkaArray::<u128>::load(skf_file) {
Expand All @@ -640,7 +645,7 @@ pub fn main() {
&mut ska_array,
output,
*min_freq,
*filter_ambiguous,
filter_ambiguous,
*threads,
);
} else {
Expand Down
2 changes: 2 additions & 0 deletions src/merge_ska_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,8 @@ where
} else {
removed += 1;
}
} else {
removed += 1;
}
}
self.variants = filtered_variants;
Expand Down
4 changes: 2 additions & 2 deletions src/ska_dict/bit_encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ pub fn base_to_prob(base: u8) -> [f64; 4] {
b'D' => [1.0 / 3.0, 0.0, 1.0 / 3.0, 1.0 / 3.0],
b'H' => [1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0, 0.0],
b'V' => [1.0 / 3.0, 1.0 / 3.0, 0.0, 1.0 / 3.0],
b'N' => [0.25, 0.25, 0.25, 0.25],
b'N' => [0.0, 0.0, 0.0, 0.0], // This gives more expected behaviour than 0.25 in each entry
_ => [0.0, 0.0, 0.0, 0.0],
}
}
Expand Down Expand Up @@ -494,7 +494,7 @@ mod tests {

assert_eq!(overlap(&k, &b), 1.0 / 3.0);
assert_eq!(overlap(&d, &h), 2.0 / 9.0);
assert_eq!(overlap(&v, &n), 0.25);
assert_eq!(overlap(&v, &n), 0.0);

assert_eq!(overlap(&n, &empty), 0.0);
}
Expand Down
7 changes: 5 additions & 2 deletions tests/distance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ fn dist_filter() {
.current_dir(sandbox.get_wd())
.arg("distance")
.arg(sandbox.file_string("merge_k9.skf", TestDir::Input))
.arg("--allow-ambiguous")
.arg("-v")
.args(&["--threads", "2"])
.assert()
Expand All @@ -60,7 +61,6 @@ fn dist_filter() {
.current_dir(sandbox.get_wd())
.arg("distance")
.arg(sandbox.file_string("merge_k9.skf", TestDir::Input))
.arg("--filter-ambiguous")
.arg("-v")
.assert()
.stdout_eq_path(sandbox.file_string("merge_k9_no_ambig.dist.stdout", TestDir::Correct));
Expand Down Expand Up @@ -97,21 +97,24 @@ fn multisample_dists() {
.assert()
.success();

// Test with filters off
Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("distance")
.arg("multidist.skf")
.arg("-v")
.args(&["--min-freq", "0"])
.arg("--allow-ambiguous")
.args(&["--threads", "2"])
.assert()
.stdout_eq_path(sandbox.file_string("multidist.stdout", TestDir::Correct));

// Test with default filters
Command::new(cargo_bin("ska"))
.current_dir(sandbox.get_wd())
.arg("distance")
.arg("multidist.skf")
.arg("-v")
.args(&["--min-freq", "0.5"])
.assert()
.stdout_eq_path(sandbox.file_string("multidist.filter.stdout", TestDir::Correct));
}
2 changes: 1 addition & 1 deletion tests/test_results_correct/merge_k9_min_freq.dist.stdout
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Sample1 Sample2 Distance Mismatches
test_1 test_2 2.50 0.00000
test_1 test_2 2.00 0.00000
14 changes: 7 additions & 7 deletions tests/test_results_correct/multidist.filter.stdout
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
Sample1 Sample2 Distance Mismatches
N_test_1 N_test_2 2.00 0.22222
N_test_1 N_test_2 2.00 0.51724
N_test_1 ambig_test_1 0.00 1.00000
N_test_1 ambig_test_2 0.00 1.00000
N_test_1 test_1 0.50 0.08333
N_test_1 test_2 1.00 0.05556
N_test_1 test_1 1.00 0.25455
N_test_1 test_2 1.00 0.42623
N_test_2 ambig_test_1 0.00 1.00000
N_test_2 ambig_test_2 0.00 1.00000
N_test_2 test_1 1.50 0.19444
N_test_2 test_2 1.00 0.16667
ambig_test_1 ambig_test_2 0.00 0.00000
N_test_2 test_1 2.00 0.56716
N_test_2 test_2 1.00 0.28571
ambig_test_1 ambig_test_2 1.00 0.44444
ambig_test_1 test_1 0.00 1.00000
ambig_test_1 test_2 0.00 1.00000
ambig_test_2 test_1 0.00 1.00000
ambig_test_2 test_2 0.00 1.00000
test_1 test_2 1.50 0.02778
test_1 test_2 3.00 0.44118

0 comments on commit 55667e8

Please sign in to comment.