diff --git a/Cargo.toml b/Cargo.toml index a064174..6f420c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "ska" -version = "0.3.8" +version = "0.3.9" authors = [ "John Lees ", "Simon Harris ", diff --git a/scripts/plot_cov.py b/scripts/plot_cov.py index 6eb9fb8..e54c498 100644 --- a/scripts/plot_cov.py +++ b/scripts/plot_cov.py @@ -8,9 +8,12 @@ import matplotlib.pyplot as plt import argparse +from math import exp +# Space at the top +y_padding = 1.1 -def norm_series(series: list()): +def norm_series(series: list): norm = max(series) series = [item / norm for item in series] return series @@ -54,7 +57,7 @@ def main(): ax1.set_xlabel("K-mer count") ax1.set_ylabel("Frequency") - ax1.set_ylim(0, k_norm[1]) + ax1.set_ylim(0, max(k_norm[1:]) * y_padding) ax1.plot( idx_xseries, k_norm, color="black", linewidth=2, label="K-mer count frequency" ) @@ -79,7 +82,7 @@ def main(): ax2.set_yscale("log") ax2.set_xlabel("K-mer count") ax2.set_ylabel("log(Frequency)") - ax2.set_ylim(min(k_norm), k_norm[1]) + ax2.set_ylim(min(k_norm), max(k_norm[1:]) * exp(y_padding)) ax2.plot( idx_xseries, k_norm, color="black", linewidth=2, label="K-mer count frequency" ) diff --git a/src/coverage.rs b/src/coverage.rs index 72d4e33..0b0018d 100644 --- a/src/coverage.rs +++ b/src/coverage.rs @@ -148,6 +148,7 @@ where /// /// # Panics /// - If the fit has already been run + #[allow(clippy::map_clone)] pub fn fit_histogram(&mut self) -> Result { if self.fitted { panic!("Model already fitted"); @@ -163,15 +164,15 @@ where } // Truncate count vec and covert to float - let mut counts_f64: Vec = Vec::new(); - for hist_bin in &self.counts { - if *hist_bin < MIN_FREQ { - break; - } else { - counts_f64.push(*hist_bin as f64); - } - } - let count_len = counts_f64.len(); + self.counts = self + .counts + .iter() + .rev() + .skip_while(|x| **x < MIN_FREQ) + .map(|x| *x) + .collect(); + self.counts.reverse(); + let counts_f64: Vec = self.counts.iter().map(|x| *x as f64).collect(); // Fit with maximum likelihood. Using BFGS optimiser and simple line search // seems to work fine @@ -182,13 +183,13 @@ where // and it gave very poor results for the c optimisation let init_hessian: Vec> = vec![vec![1.0, 0.0], vec![0.0, 1.0]]; let linesearch = BacktrackingLineSearch::new(ArmijoCondition::new(0.0001f64)?); - let solver = BFGS::new(linesearch); + let solver = BFGS::new(linesearch).with_tolerance_cost(1e-6)?; // Usually around 10 iterations should be enough let mut exec = Executor::new(mixture_fit, solver).configure(|state| { state .param(init_param) .inv_hessian(init_hessian) - .max_iters(100) + .max_iters(20) }); if self.verbose { exec = exec.add_observer(SlogLogger::term(), ObserverMode::Always); @@ -205,7 +206,7 @@ where self.c = best[1]; // calculate the coverage cutoff - self.cutoff = find_cutoff(best, count_len); + self.cutoff = find_cutoff(best, self.counts.len()); self.fitted = true; Ok(self.cutoff) } else { @@ -232,13 +233,10 @@ where log::info!("Calculating and printing count series"); println!("Count\tK_mers\tMixture_density\tComponent"); for (idx, count) in self.counts.iter().enumerate() { - if *count < MIN_FREQ { - break; - } println!( "{}\t{}\t{:e}\t{}", idx + 1, - *count, + *count as usize, f64::exp(lse( a(self.w0, idx as f64 + 1.0), b(self.w0, self.c, idx as f64 + 1.0) diff --git a/src/lib.rs b/src/lib.rs index c65fdd4..1eeb3a2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -56,14 +56,14 @@ //! ## Important notes //! //! - In this version of ska the k-mer length is the _total_ split k-mer size, -//! whereas in ska1 the k-mer length is half the split length. So the default of -//! k=15 in ska1 corresponds to choosing `-k 31` in this version. +//! whereas in ska1 the k-mer length is half the split length. So the default of +//! k=15 in ska1 corresponds to choosing `-k 31` in this version. //! - If you are using FASTQ input files (reads), these must be provided as two -//! deinterleaved files using the `-f` option to `ska build`. Providing them as -//! a single file will treat them as FASTA, and not correct sequencing errors. +//! deinterleaved files using the `-f` option to `ska build`. Providing them as +//! a single file will treat them as FASTA, and not correct sequencing errors. //! - If you are running `ska map`, it is more memory efficient to run these one at -//! a time, rather than merging a single `.skf` file. A command for doing this -//! in parallel is listed below. +//! a time, rather than merging a single `.skf` file. A command for doing this +//! in parallel is listed below. //! //! ## Common options //! @@ -107,14 +107,15 @@ //! ``` //! Options for filtering/error correction are: //! - `--min-count`. Specify a minimum number of appearances a k-mer must have -//! to be included. This is an effective way of filtering sequencing errors if set -//! to at least three, but higher may be appropriate for higher coverage data. -//! A two-step blocked bloom and hash table filter is used for memory efficiency. +//! to be included. This is an effective way of filtering sequencing errors if set +//! to at least three, but higher may be appropriate for higher coverage data. +//! A two-step blocked bloom and hash table filter is used for memory efficiency. //! - `--proportion-reads`. Specify a proportion of reads to use to build the .skf file. -//! The value of this parameter must be between 0 and 1. +//! The value of this parameter must be between 0 and 1. If you have high coverage samples +//! this can be used to reduce the build time. //! - `--qual-filter`. `none` do not filter based on quality scores. -//! `middle` (default) filter k-mers where the middle base is below the minimum quality. -//! `strict` filter k-mers where any base is below the minimum quality. +//! `middle` (default) filter k-mers where the middle base is below the minimum quality. +//! `strict` filter k-mers where any base is below the minimum quality. //! - `--min-qual`. Specify a minimum PHRED score to use in the filter. //! //! FASTQ files must be paired end. If you'd like to request more flexibility in diff --git a/src/merge_ska_array.rs b/src/merge_ska_array.rs index fad5642..ff472a8 100644 --- a/src/merge_ska_array.rs +++ b/src/merge_ska_array.rs @@ -379,7 +379,7 @@ where /// # Arguments /// /// - `constant` – the number of prefiltered constant bases, used to adjust - /// the denominator of mismatch proportion + /// the denominator of mismatch proportion pub fn distance(&self, constant: f64) -> Vec> { let mut distances: Vec> = Vec::new(); self.variants