Skip to content

Commit

Permalink
Merge pull request #653 from wtsi-npg/devel
Browse files Browse the repository at this point in the history
devel to master, prep for release 66.6
  • Loading branch information
dozy authored Nov 4, 2019
2 parents 43d6a7f + 06e9754 commit 1339bc9
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 39 deletions.
5 changes: 4 additions & 1 deletion Changes
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
LIST OF CHANGES FOR NPG-QC PACKAGE

- bug fix for mqc skipper script which, because of filtering on
study name, always gave the count of unique studies on a run as one

release 67.3.0
- change default file type to cram for genotype check
- script for skippimg manual QC step: an additional filter
- script for skipping manual QC step: an additional filter
to ensure that all lanes for a run that bypasses
manual QC have deplexed properly

Expand Down
27 changes: 22 additions & 5 deletions bin/npg_mqc_skipper
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ my $run_list = sub {
return join q[,], map { q[?] } @ids;
};

my $get_run_ids = sub {
  # Drain an executed statement handle, collecting the first column
  # (the run id) of every fetched row.
  my ($handle) = @_;
  my @ids;
  while (my @row = $handle->fetchrow_array()) {
    push @ids, $row[0];
  }
  return @ids;
};

my @run_ids = map { $_->id_run } @rows;
my $placeholders = $run_list->(@run_ids);

Expand All @@ -72,16 +81,24 @@ my $query =
q[from iseq_product_metrics p ] .
q[join iseq_flowcell f on p.id_iseq_flowcell_tmp=f.id_iseq_flowcell_tmp ] .
q[join study s on s.id_study_tmp=f.id_study_tmp ] .
qq[where s.name = ? and s.name != ? and p.id_run in (${placeholders}) ] .
qq[where s.name != ? and p.id_run in (${placeholders}) ] .
q[group by p.id_run having study_count = ?];
my $sth = $dbh->prepare($query) or croak "Failed to prepare statement: $DBI::errstr";
# Run time database errors are thrown by the execute method, no need to
# do anything special.
$sth->execute($study_name, $EXCLUDE_STUDY_NAME, @run_ids, $NUM_DISTINCT_STUDIES);
$sth->execute($EXCLUDE_STUDY_NAME, @run_ids, $NUM_DISTINCT_STUDIES);
@run_ids = $get_run_ids->($sth);

@run_ids = ();
while (my @data = $sth->fetchrow_array()) {
push @run_ids, $data[0];
if (@run_ids) {
$placeholders = $run_list->(@run_ids);
$query =
q[select distinct p.id_run from iseq_product_metrics p join iseq_flowcell f ] .
q[on p.id_iseq_flowcell_tmp=f.id_iseq_flowcell_tmp ] .
q[join study s on s.id_study_tmp=f.id_study_tmp ] .
qq[where s.name = ? and p.id_run in (${placeholders})];
$sth = $dbh->prepare($query) or croak "Failed to prepare statement: $DBI::errstr";
$sth->execute($study_name, @run_ids);
@run_ids = $get_run_ids->($sth);
}

if (@run_ids) {
Expand Down
102 changes: 71 additions & 31 deletions lib/npg_qc/autoqc/checks/bam_flagstats.pm
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,18 @@ Readonly::Hash my %METRICS_FIELD_MAPPING => {
'ESTIMATED_LIBRARY_SIZE' => 'library_size'
};

# picard and biobambam mark duplicates assign this
# value for aligned data with no mapped paired reads
Readonly::Scalar my $LIBRARY_SIZE_NOT_AVAILABLE => -1;
Readonly::Scalar my $METRICS_NUMBER => 9;
Readonly::Hash my %SAMTOOLS_METRICS_FIELD_MAPPING => {
'LIBRARY' => 'library',
'PAIRED' => 'read_pairs_examined',
'DUPLICATE SINGLE' => 'unpaired_read_duplicates',
'DUPLICATE PAIR' => 'paired_read_duplicates',
'DUPLICATE PAIR OPTICAL' => 'read_pair_optical_duplicates',
'PERCENT_DUPLICATION' => 'percent_duplicate',
'ESTIMATED_LIBRARY_SIZE' => 'library_size'
};

Readonly::Scalar my $LIBRARY_SIZE_NOT_AVAILABLE => -1; # assigned to ESTIMATED_LIBRARY_SIZE by picard and biobambam for aligned data with no mapped paired reads
Readonly::Scalar my $METRICS_NUMBER => 10;
Readonly::Scalar my $PAIR_NUMBER => 2;
Readonly::Scalar my $TARGET_STATS_PATTERN => 'target';
Readonly::Scalar my $TARGET_AUTOSOME_STATS_PATTERN => 'target_autosome';
Expand Down Expand Up @@ -181,48 +189,80 @@ sub _parse_markdups_metrics {
my @file_contents = slurp ( $self->markdups_metrics_file, { irs => qr/\n\n/mxs } );

my $header = $file_contents[0];
chomp $header;
$self->result()->set_info('markdups_metrics_header', $header);

my $metrics = $file_contents[1];
my $histogram = $file_contents[2];
if($header =~ /^COMMAND:[^\n]*samtools[ ]markdup/smx) {
chomp $header;
$self->result()->set_info('markdups_metrics_header', $header);

my %metrics = map { split /:/smx } (split /\n/smx, $header);

my @metrics_lines = split /\n/mxs, $metrics;
my @metrics_header = split /\t/mxs, $metrics_lines[1];
my @metrics_numbers = split /\t/mxs, $metrics_lines[2];
@metrics{keys %metrics} = (map { _trim($_) } values %metrics); # remove any leading and trailing spaces from values

my %metrics;
@metrics{@metrics_header} = @metrics_numbers;
for my $field (keys %SAMTOOLS_METRICS_FIELD_MAPPING) {
my $field_value = $metrics{$field};

if (scalar @metrics_numbers > $METRICS_NUMBER ) {
croak 'MarkDuplicate metrics format is wrong';
($field eq q[PAIRED] or $field eq q[DUPLICATE PAIR] or $field eq q[DUPLICATE PAIR OPTICAL]) && ($field_value /= 2);
($field eq q[PERCENT_DUPLICATION]) && ((($field_value = $metrics{'EXAMINED'}) == 0) || ($field_value = sprintf q[%0.6f], ($metrics{'DUPLICATE PAIR'} + $metrics{'DUPLICATE SINGLE'}) / $metrics{'EXAMINED'}));
($field eq q[COMMAND]) && next;

$self->result->${\$SAMTOOLS_METRICS_FIELD_MAPPING{$field}}($field_value);
}

# note: no histogram from samtools markdup
}
else { # not samtools, assume picard/biobambam2 format
chomp $header;
$self->result()->set_info('markdups_metrics_header', $header);

my $metrics = $file_contents[1];
my $histogram = $file_contents[2];

my @metrics_lines = split /\n/mxs, $metrics;
my @metrics_header = split /\t/mxs, $metrics_lines[1];
my @metrics_numbers = split /\t/mxs, $metrics_lines[2];

foreach my $field (keys %METRICS_FIELD_MAPPING){
my $field_value = $metrics{$field};
if ($field_value) {
if ($field_value =~/\?/mxs) {
$field_value = undef;
} elsif ($field eq 'ESTIMATED_LIBRARY_SIZE' && $field_value < 0) {
if ($field_value == $LIBRARY_SIZE_NOT_AVAILABLE) {
my %metrics;
@metrics{@metrics_header} = @metrics_numbers;

if (scalar @metrics_numbers > $METRICS_NUMBER ) {
croak 'MarkDuplicate metrics format is wrong';
}

foreach my $field (keys %METRICS_FIELD_MAPPING){
my $field_value = $metrics{$field};
if ($field_value) {
if ($field_value =~/\?/mxs) {
$field_value = undef;
} else {
croak "Library size less than $LIBRARY_SIZE_NOT_AVAILABLE";
} elsif ($field eq 'ESTIMATED_LIBRARY_SIZE' && $field_value < 0) {
if ($field_value == $LIBRARY_SIZE_NOT_AVAILABLE) {
$field_value = undef;
} else {
croak "Library size less than $LIBRARY_SIZE_NOT_AVAILABLE";
}
}
}
$self->result()->${\$METRICS_FIELD_MAPPING{$field}}( $field_value );
}
$self->result()->${\$METRICS_FIELD_MAPPING{$field}}( $field_value );
}

if ($histogram) {
my @histogram_lines = split /\n/mxs, $histogram;
my %histogram_hash = map { $_->[0] => $_->[1] } map{ [split /\s/mxs] } grep {/^[\d]/mxs } @histogram_lines;
$self->result()->histogram(\%histogram_hash);
if ($histogram) {
my @histogram_lines = split /\n/mxs, $histogram;
my %histogram_hash = map { $_->[0] => $_->[1] } map{ [split /\s/mxs] } grep {/^[\d]/mxs } @histogram_lines;
$self->result()->histogram(\%histogram_hash);
}
}

return;
}

sub _trim {
  my ($raw) = @_;
  # Strip leading whitespace, then trailing whitespace; the /r modifier
  # returns modified copies instead of mutating, so the two substitutions
  # can be chained into a single expression.
  return $raw =~ s/^\s*//smxr =~ s/\s*$//smxr;
}

sub _parse_flagstats {
my $self = shift;

Expand Down
2 changes: 1 addition & 1 deletion npg_qc_viewer/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"url": "https://github.com/wtsi-npg/npg_qc.git"
},
"devDependencies" : {
"bower": "1.8.2",
"bower": "1.8.8",
"grunt": "1.0.1",
"grunt-cli": "1.2.0",
"load-grunt-tasks": "3.5.0",
Expand Down
49 changes: 48 additions & 1 deletion t/60-autoqc-checks-bam_flagstats.t
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use strict;
use warnings;
use Test::More tests => 11;
use Test::More tests => 12;
use Test::Exception;
use Test::Deep;
use File::Temp qw( tempdir );
Expand Down Expand Up @@ -562,4 +562,51 @@ subtest 'full functionality with optional target and autosome target stats' => s

};

# Unpack the samtools-markdup fixture archive into the temp dir and
# derive the file paths used by the 'samtools markdups metrics' subtest.
my $archive_30917 = '30917_1_1';
my $ae_30917 = Archive::Extract->new(archive => "t/data/autoqc/bam_flagstats/${archive_30917}.tar.gz");
# BUG FIX: on failure report this extractor's error, not $ae's (the
# extractor of an earlier, unrelated fixture).
$ae_30917->extract(to => $tempdir) or die $ae_30917->error;
$archive_30917 = join q[/], $tempdir, $archive_30917;
my $qc_in = qq[$archive_30917/30917_1#1];
my $markdups_metrics_file = qq[$qc_in/30917_1#1.markdups_metrics.txt];
my $flagstat_file = qq[$qc_in/30917_1#1.flagstat];
my $input_file = qq[$qc_in/30917_1#1.cram];
my $qc_out = join q[/], $qc_in, 'qc';

# Regression test for parsing metrics produced by 'samtools markdup'
# (as opposed to picard/biobambam2 output): run the full check on the
# extracted 30917 fixture and compare the serialised result against a
# stored expected JSON document.
subtest 'test samtools markdups metrics' => sub {
plan tests => 3;

# Directory the check writes its JSON output to.
mkdir $qc_out;

my $c = npg_qc::autoqc::checks::bam_flagstats->new(
id_run => 30917,
position => 1,
tag_index => 1,
qc_in => $qc_in,
qc_out => $qc_out,
markdups_metrics_file => $markdups_metrics_file,
flagstats_metrics_file => $flagstat_file,
input_files => [$input_file],
);

# Expected result shipped inside the fixture archive; its 'path' value
# is relative, so prepend the temp dir the archive was extracted to.
my $expected = from_json(slurp qq{$qc_in/qc_expected/30917_1#1.bam_flagstats.json}, {chomp=>1});
$expected->{path} = qq{$tempdir/$expected->{path}};

lives_ok { $c->execute() } 'execute method is ok';
my $r;
my $result_json;
lives_ok {
$r = $c->result();
$result_json = from_json($r->freeze());
} 'no error when serializing to json string and file';

# Remove fields that legitimately differ between runs (class metadata,
# composition digest, check name/version) before the deep comparison.
foreach my $res ($expected, $result_json){
delete $res->{'__CLASS__'};
delete $res->{'composition'};
delete $res->{'info'}->{'Check'};
delete $res->{'info'}->{'Check_version'};
}

is_deeply($result_json, $expected, 'correct json output');
};

1;
Binary file added t/data/autoqc/bam_flagstats/30917_1_1.tar.gz
Binary file not shown.

0 comments on commit 1339bc9

Please sign in to comment.