Skip to content

Commit

Permalink
Merge pull request #653 from wtsi-npg/devel
Browse files Browse the repository at this point in the history
devel to master, prep for release 66.6
  • Loading branch information
dozy authored Nov 4, 2019
2 parents 43d6a7f + 06e9754 commit 1339bc9
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 39 deletions.
5 changes: 4 additions & 1 deletion Changes
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
LIST OF CHANGES FOR NPG-QC PACKAGE

- bug fix for mqc skipper script which, because of filtering on
study name, always gave the count of unique studies on a run as one

release 67.3.0
- change default file type to cram for genotype check
- script for skippimg manual QC step: an additional filter
- script for skipping manual QC step: an additional filter
to ensure that all lanes for a run that bypasses
manual QC have deplexed properly

Expand Down
27 changes: 22 additions & 5 deletions bin/npg_mqc_skipper
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ my $run_list = sub {
return join q[,], map { q[?] } @ids;
};

my $get_run_ids = sub {
  # Drain an executed statement handle, collecting the first column
  # (the run id) of every fetched row.
  my ($handle) = @_;
  my @ids;
  while (my @row = $handle->fetchrow_array()) {
    push @ids, $row[0];
  }
  return @ids;
};

my @run_ids = map { $_->id_run } @rows;
my $placeholders = $run_list->(@run_ids);

Expand All @@ -72,16 +81,24 @@ my $query =
q[from iseq_product_metrics p ] .
q[join iseq_flowcell f on p.id_iseq_flowcell_tmp=f.id_iseq_flowcell_tmp ] .
q[join study s on s.id_study_tmp=f.id_study_tmp ] .
qq[where s.name = ? and s.name != ? and p.id_run in (${placeholders}) ] .
qq[where s.name != ? and p.id_run in (${placeholders}) ] .
q[group by p.id_run having study_count = ?];
my $sth = $dbh->prepare($query) or croak "Failed to prepare statement: $DBI::errstr";
# Run time database errors are thrown by the execute method, no need to
# do anything special.
$sth->execute($study_name, $EXCLUDE_STUDY_NAME, @run_ids, $NUM_DISTINCT_STUDIES);
$sth->execute($EXCLUDE_STUDY_NAME, @run_ids, $NUM_DISTINCT_STUDIES);
@run_ids = $get_run_ids->($sth);

@run_ids = ();
while (my @data = $sth->fetchrow_array()) {
push @run_ids, $data[0];
if (@run_ids) {
$placeholders = $run_list->(@run_ids);
$query =
q[select distinct p.id_run from iseq_product_metrics p join iseq_flowcell f ] .
q[on p.id_iseq_flowcell_tmp=f.id_iseq_flowcell_tmp ] .
q[join study s on s.id_study_tmp=f.id_study_tmp ] .
qq[where s.name = ? and p.id_run in (${placeholders})];
$sth = $dbh->prepare($query) or croak "Failed to prepare statement: $DBI::errstr";
$sth->execute($study_name, @run_ids);
@run_ids = $get_run_ids->($sth);
}

if (@run_ids) {
Expand Down
102 changes: 71 additions & 31 deletions lib/npg_qc/autoqc/checks/bam_flagstats.pm
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,18 @@ Readonly::Hash my %METRICS_FIELD_MAPPING => {
'ESTIMATED_LIBRARY_SIZE' => 'library_size'
};

# picard and biobambam mark duplicates assign this
# value for aligned data with no mapped paired reads
Readonly::Scalar my $LIBRARY_SIZE_NOT_AVAILABLE => -1;
Readonly::Scalar my $METRICS_NUMBER => 9;
Readonly::Hash my %SAMTOOLS_METRICS_FIELD_MAPPING => {
'LIBRARY' => 'library',
'PAIRED' => 'read_pairs_examined',
'DUPLICATE SINGLE' => 'unpaired_read_duplicates',
'DUPLICATE PAIR' => 'paired_read_duplicates',
'DUPLICATE PAIR OPTICAL' => 'read_pair_optical_duplicates',
'PERCENT_DUPLICATION' => 'percent_duplicate',
'ESTIMATED_LIBRARY_SIZE' => 'library_size'
};

Readonly::Scalar my $LIBRARY_SIZE_NOT_AVAILABLE => -1; # assigned to ESTIMATED_LIBRARY_SIZE by picard and biobambam for aligned data with no mapped paired reads
Readonly::Scalar my $METRICS_NUMBER => 10;
Readonly::Scalar my $PAIR_NUMBER => 2;
Readonly::Scalar my $TARGET_STATS_PATTERN => 'target';
Readonly::Scalar my $TARGET_AUTOSOME_STATS_PATTERN => 'target_autosome';
Expand Down Expand Up @@ -181,48 +189,80 @@ sub _parse_markdups_metrics {
my @file_contents = slurp ( $self->markdups_metrics_file, { irs => qr/\n\n/mxs } );

my $header = $file_contents[0];
chomp $header;
$self->result()->set_info('markdups_metrics_header', $header);

my $metrics = $file_contents[1];
my $histogram = $file_contents[2];
if($header =~ /^COMMAND:[^\n]*samtools[ ]markdup/smx) {
chomp $header;
$self->result()->set_info('markdups_metrics_header', $header);

my %metrics = map { split /:/smx } (split /\n/smx, $header);

my @metrics_lines = split /\n/mxs, $metrics;
my @metrics_header = split /\t/mxs, $metrics_lines[1];
my @metrics_numbers = split /\t/mxs, $metrics_lines[2];
@metrics{keys %metrics} = (map { _trim($_) } values %metrics); # remove any leading and trailing spaces from values

my %metrics;
@metrics{@metrics_header} = @metrics_numbers;
for my $field (keys %SAMTOOLS_METRICS_FIELD_MAPPING) {
my $field_value = $metrics{$field};

if (scalar @metrics_numbers > $METRICS_NUMBER ) {
croak 'MarkDuplicate metrics format is wrong';
($field eq q[PAIRED] or $field eq q[DUPLICATE PAIR] or $field eq q[DUPLICATE PAIR OPTICAL]) && ($field_value /= 2);
($field eq q[PERCENT_DUPLICATION]) && ((($field_value = $metrics{'EXAMINED'}) == 0) || ($field_value = sprintf q[%0.6f], ($metrics{'DUPLICATE PAIR'} + $metrics{'DUPLICATE SINGLE'}) / $metrics{'EXAMINED'}));
($field eq q[COMMAND]) && next;

$self->result->${\$SAMTOOLS_METRICS_FIELD_MAPPING{$field}}($field_value);
}

# note: no histogram from samtools markdup
}
else { # not samtools, assume picard/biobambam2 format
chomp $header;
$self->result()->set_info('markdups_metrics_header', $header);

my $metrics = $file_contents[1];
my $histogram = $file_contents[2];

my @metrics_lines = split /\n/mxs, $metrics;
my @metrics_header = split /\t/mxs, $metrics_lines[1];
my @metrics_numbers = split /\t/mxs, $metrics_lines[2];

foreach my $field (keys %METRICS_FIELD_MAPPING){
my $field_value = $metrics{$field};
if ($field_value) {
if ($field_value =~/\?/mxs) {
$field_value = undef;
} elsif ($field eq 'ESTIMATED_LIBRARY_SIZE' && $field_value < 0) {
if ($field_value == $LIBRARY_SIZE_NOT_AVAILABLE) {
my %metrics;
@metrics{@metrics_header} = @metrics_numbers;

if (scalar @metrics_numbers > $METRICS_NUMBER ) {
croak 'MarkDuplicate metrics format is wrong';
}

foreach my $field (keys %METRICS_FIELD_MAPPING){
my $field_value = $metrics{$field};
if ($field_value) {
if ($field_value =~/\?/mxs) {
$field_value = undef;
} else {
croak "Library size less than $LIBRARY_SIZE_NOT_AVAILABLE";
} elsif ($field eq 'ESTIMATED_LIBRARY_SIZE' && $field_value < 0) {
if ($field_value == $LIBRARY_SIZE_NOT_AVAILABLE) {
$field_value = undef;
} else {
croak "Library size less than $LIBRARY_SIZE_NOT_AVAILABLE";
}
}
}
$self->result()->${\$METRICS_FIELD_MAPPING{$field}}( $field_value );
}
$self->result()->${\$METRICS_FIELD_MAPPING{$field}}( $field_value );
}

if ($histogram) {
my @histogram_lines = split /\n/mxs, $histogram;
my %histogram_hash = map { $_->[0] => $_->[1] } map{ [split /\s/mxs] } grep {/^[\d]/mxs } @histogram_lines;
$self->result()->histogram(\%histogram_hash);
if ($histogram) {
my @histogram_lines = split /\n/mxs, $histogram;
my %histogram_hash = map { $_->[0] => $_->[1] } map{ [split /\s/mxs] } grep {/^[\d]/mxs } @histogram_lines;
$self->result()->histogram(\%histogram_hash);
}
}

return;
}

sub _trim {
  my ($raw) = @_;
  # Strip leading whitespace, then trailing whitespace; the /r modifier
  # returns modified copies instead of mutating, so the two substitutions
  # can be chained into a single expression.
  return $raw =~ s/^\s*//smxr =~ s/\s*$//smxr;
}

sub _parse_flagstats {
my $self = shift;

Expand Down
2 changes: 1 addition & 1 deletion npg_qc_viewer/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"url": "https://github.com/wtsi-npg/npg_qc.git"
},
"devDependencies" : {
"bower": "1.8.2",
"bower": "1.8.8",
"grunt": "1.0.1",
"grunt-cli": "1.2.0",
"load-grunt-tasks": "3.5.0",
Expand Down
49 changes: 48 additions & 1 deletion t/60-autoqc-checks-bam_flagstats.t
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use strict;
use warnings;
use Test::More tests => 11;
use Test::More tests => 12;
use Test::Exception;
use Test::Deep;
use File::Temp qw( tempdir );
Expand Down Expand Up @@ -562,4 +562,51 @@ subtest 'full functionality with optional target and autosome target stats' => s

};

# Unpack the samtools-markdup fixture archive into the temp dir and
# derive the file paths used by the 'samtools markdups metrics' subtest.
my $archive_30917 = '30917_1_1';
my $ae_30917 = Archive::Extract->new(archive => "t/data/autoqc/bam_flagstats/${archive_30917}.tar.gz");
# BUG FIX: on failure report this extractor's error, not $ae's (the
# extractor of an earlier, unrelated fixture).
$ae_30917->extract(to => $tempdir) or die $ae_30917->error;
$archive_30917 = join q[/], $tempdir, $archive_30917;
my $qc_in = qq[$archive_30917/30917_1#1];
my $markdups_metrics_file = qq[$qc_in/30917_1#1.markdups_metrics.txt];
my $flagstat_file = qq[$qc_in/30917_1#1.flagstat];
my $input_file = qq[$qc_in/30917_1#1.cram];
my $qc_out = join q[/], $qc_in, 'qc';

# Regression test for parsing metrics produced by 'samtools markdup'
# (as opposed to picard/biobambam2 output): run the full check on the
# extracted 30917 fixture and compare the serialised result against a
# stored expected JSON document.
subtest 'test samtools markdups metrics' => sub {
plan tests => 3;

# Directory the check writes its JSON output to.
mkdir $qc_out;

my $c = npg_qc::autoqc::checks::bam_flagstats->new(
id_run => 30917,
position => 1,
tag_index => 1,
qc_in => $qc_in,
qc_out => $qc_out,
markdups_metrics_file => $markdups_metrics_file,
flagstats_metrics_file => $flagstat_file,
input_files => [$input_file],
);

# Expected result shipped inside the fixture archive; its 'path' value
# is relative, so prepend the temp dir the archive was extracted to.
my $expected = from_json(slurp qq{$qc_in/qc_expected/30917_1#1.bam_flagstats.json}, {chomp=>1});
$expected->{path} = qq{$tempdir/$expected->{path}};

lives_ok { $c->execute() } 'execute method is ok';
my $r;
my $result_json;
lives_ok {
$r = $c->result();
$result_json = from_json($r->freeze());
} 'no error when serializing to json string and file';

# Remove fields that legitimately differ between runs (class metadata,
# composition digest, check name/version) before the deep comparison.
foreach my $res ($expected, $result_json){
delete $res->{'__CLASS__'};
delete $res->{'composition'};
delete $res->{'info'}->{'Check'};
delete $res->{'info'}->{'Check_version'};
}

is_deeply($result_json, $expected, 'correct json output');
};

1;
Binary file added t/data/autoqc/bam_flagstats/30917_1_1.tar.gz
Binary file not shown.

0 comments on commit 1339bc9

Please sign in to comment.