Skip to content

Commit

Permalink
Merge pull request #139 from dnanexus-rnd/mlin-deepvariant-20181109
Browse files Browse the repository at this point in the history
Two DeepVariant corner cases
  • Loading branch information
mlin committed Nov 10, 2018
2 parents 6031a2e + 380482c commit 5a7040c
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 14 deletions.
8 changes: 6 additions & 2 deletions src/cli_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ bool check_dir_exists(const string &path) {
// should reside in some user-modifiable yml file
static const char* config_presets_yml = R"eof(
gatk:
description: Merge and joint-call GATK-style gVCFs
description: Joint-call GATK-style gVCFs
unifier_config:
min_AQ1: 70
min_AQ2: 40
Expand Down Expand Up @@ -514,7 +514,7 @@ static const char* config_presets_yml = R"eof(
count: 0
ignore_non_variants: true
gatk_unfiltered:
description: Merge GATK-style gVCFs with no filtering or genotype revision.
description: Merge GATK-style gVCFs with no QC filters or genotype revision
unifier_config:
min_AQ1: 0
min_AQ2: 0
Expand Down Expand Up @@ -564,6 +564,7 @@ static const char* config_presets_yml = R"eof(
count: 0
ignore_non_variants: true
xAtlas:
description: Joint-call xAtlas gVCFs
unifier_config:
min_AQ1: 10
min_AQ2: 3
Expand Down Expand Up @@ -632,6 +633,7 @@ static const char* config_presets_yml = R"eof(
count: 1
ignore_non_variants: true
xAtlas_unfiltered:
description: Merge xAtlas gVCFs with no QC filters or genotype revision
unifier_config:
monoallelic_sites_for_lost_alleles: true
genotyper_config:
Expand Down Expand Up @@ -697,6 +699,7 @@ static const char* config_presets_yml = R"eof(
count: 1
ignore_non_variants: true
weCall:
description: Joint-call xAtlas gVCFs
unifier_config:
min_AQ1: 30
min_AQ2: 30
Expand Down Expand Up @@ -756,6 +759,7 @@ static const char* config_presets_yml = R"eof(
count: 1
ignore_non_variants: true
DeepVariant:
description: "[EXPERIMENTAL] Merge DeepVariant gVCFs with no QC filters or genotype revision"
unifier_config:
min_AQ1: 0
min_AQ2: 0
Expand Down
13 changes: 13 additions & 0 deletions src/genotyper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,19 @@ Status prepare_dataset_records(const genotyper_config& cfg, const unified_site&
S(update_min_ref_depth(dataset, hdr, bcf_nsamples, sample_mapping,
ref_records, depth, min_ref_depth));

// ex post facto check for reference confidence records whose GT is other
// than 0/0 (probably ./.), which we'll translate to PartialData non-calls
for (const auto& rp : all_records) {
if (rp->is_ref) {
for (unsigned i = 0; i < 2*rp->p->n_sample; i++) {
if (bcf_gt_is_missing(rp->gt[i]) || bcf_gt_allele(rp->gt[i]) != 0) {
rnc = NoCallReason::PartialData;
return Status::OK();
}
}
}
}

// Success...
rnc = NoCallReason::N_A;
return Status::OK();
Expand Down
35 changes: 25 additions & 10 deletions src/unifier.cc
Original file line number Diff line number Diff line change
Expand Up @@ -473,18 +473,21 @@ Status delineate_sites(const unifier_config& cfg, discovered_alleles& alleles,
}

for (const auto& pa : pruned) {
discovered_allele *longest_original_ref = nullptr;
for (const auto& al : pa.second.originals) {
const auto r = refs_by_range.find(al.pos);
if (r == refs_by_range.end()) {
return Status::Invalid("delineate_sites: missing REF allele for ", al.pos.str());
}
if (!longest_original_ref || longest_original_ref->first.dna.size() < r->second.first.dna.size()) {
longest_original_ref = &(r->second);
// we need to supply a discovered reference allele to go along with
// the pruned alt allele; find the shortest one which contains the
// alt. note, we realigned the alt so this might not cover the
// original!
const discovered_allele *shortest_containing_ref = nullptr;
for (const auto& ref : refs_by_range) {
if (ref.first.contains(pa.first.pos) &&
(!shortest_containing_ref || ref.first.size() < shortest_containing_ref->first.pos.size())) {
shortest_containing_ref = &ref.second;
}
}
assert(longest_original_ref);
all_pruned_alleles.push_back(make_pair(pa, *longest_original_ref));
if (!shortest_containing_ref) {
return Status::Invalid("delineate_sites: missing REF allele for ", pa.first.str());
}
all_pruned_alleles.push_back(make_pair(pa, *shortest_containing_ref));
}
}
return Status::OK();
Expand Down Expand Up @@ -575,6 +578,18 @@ Status unified_sites(const unifier_config& cfg,
Status s;
unifier_stats stats;

/* desperate-straits debugging:
for (const auto& allele : alleles) {
if (allele.first.pos.overlaps(range(7, 16188960, 16188970))) {
cerr << allele.first.str();
if (allele.second.is_ref) {
cerr << " *";
}
cerr << endl;
}
}
*/

map<range,tuple<discovered_alleles,minimized_alleles,minimized_alleles>> sites;
vector<pair<minimized_allele,discovered_allele>> all_pruned_alleles;
S(delineate_sites(cfg, alleles, sites, all_pruned_alleles));
Expand Down
50 changes: 48 additions & 2 deletions test/data/gvcf_test_cases/deepvariant.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@ readme: |
DeepVariant 0.5.0 gVCF dialect
TODO: add third sample to provoke multiallelic site and focus PL liftover
The cluster at 16188963-16188973 provokes an obscure code path: a monoallelic
site including a non-left-aligned alternate allele that's equivalent to
another record in the same sample. (The 'in the same sample' bit hasn't
actually been seen in the wild, but otherwise the scenario is based on a real
example.) TODO: can we safely call more in NA12877?
input:
header : |-
##fileformat=VCFv4.2
Expand Down Expand Up @@ -42,6 +48,9 @@ input:
chr21 11000381 . AG A,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 1/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990
chr21 11000382 . G A,<*> 4 PASS . GT:GQ:DP:AD:VAF:PL 1/1:4:20:0,20,0:1,0:1,14,0,990,990,990
chr21 11000383 . A <*> 0 . END=11000385 GT:GQ:MIN_DP:PL 0/0:50:50:0,150,1499
chr21 16188963 . CAT C,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 1/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990
chr21 16188966 . A <*> 0 . END=16188966 GT:GQ:MIN_DP:PL 0/0:50:50:0,150,1499
chr21 16188967 . C G,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 1/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990
- NA12877.gvcf: |
NA12877
chr21 9412001 . A <*> 0 . END=9412144 GT:GQ:MIN_DP:PL 0/0:50:23:0,84,839
Expand Down Expand Up @@ -70,7 +79,10 @@ input:
chr21 9418226 . C <*> 0 . END=9418259 GT:GQ:MIN_DP:PL 0/0:50:90:0,294,2939
chr21 9418260 . A T,<*> 36.1 PASS . GT:GQ:DP:AD:VAF:PL 1/1:35:91:0,91,0:1,0:36,40,0,990,990,990
chr21 9418261 . G <*> 0 . END=9418296 GT:GQ:MIN_DP:PL 0/0:50:79:0,276,2759
chr21 11000367 . G <*> 0 . END=11000385 GT:GQ:MIN_DP:PL 0/0:50:79:0,276,2759
chr21 11000367 . G <*> 0 . END=11000385 GT:GQ:MIN_DP:PL ./.:0:1:29,3,0
chr21 16188963 . C <*> 0 . END=16188964 GT:GQ:MIN_DP:PL 0/0:50:50:0,150,1499
chr21 16188965 . TAC T,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990
chr21 16188967 . CATAT TAT,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990
unifier_config:
min_AQ1: 0
Expand Down Expand Up @@ -174,6 +186,37 @@ truth_unified_sites:
quality: 1
frequency: 0.5
quality: 2
- range: {ref: chr21, beg: 16188963, end: 16188965}
in_target: {ref: chr21, beg: 1, end: 1000000000}
alleles:
- dna: CAT
- dna: C
quality: 2
frequency: 0.5
lost_allele_frequency: 0.5
quality: 2
- range: {ref: chr21, beg: 16188965, end: 16188967}
in_target: {ref: chr21, beg: 1, end: 1000000000}
alleles:
- dna: TAC
- dna: T
quality: 2
frequency: 0.5
quality: 2
monoallelic: true
unification:
- range: {beg: 16188967, end: 16188971}
dna: TAT
to: 1
- range: {ref: chr21, beg: 16188967, end: 16188967}
in_target: {ref: chr21, beg: 1, end: 1000000000}
alleles:
- dna: C
- dna: G
quality: 2
frequency: 0.5
lost_allele_frequency: 0.5
quality: 2

truth_output_vcf:
- truth.vcf: |
Expand All @@ -195,4 +238,7 @@ truth_output_vcf:
chr21 9412435 chr21_9412435_T_A T A 23 . AF=0.25;AQ=23 GT:GQ:PL:DP:AD:RNC 0/1:24:23,0,37:44:22,22:.. 0/0:14:0,13,42:103:89,14:..
chr21 9412441 chr21_9412441_TATA_T TATA T 31 . AF=0.5;AQ=31 GT:GQ:PL:DP:AD:RNC 0/1:26:31,0,27:47:25,22:.. 0/1:27:27,0,46:98:69,29:..
chr21 9418260 chr21_9418260_A_T;chr21_9418260_A_C A T,C 43 . AF=0.75,0.25;AQ=43,42 GT:GQ:PL:DP:AD:RNC 1/1:35:36,40,0,.,.,.:91:0,91,0:.. 1/2:38:45,48,42,48,0,43:135:0,106,29:..
chr21 11000381 chr21_11000381_AG_A;chr21_11000382_G_A AG A,AA 2 . AF=0.5,0.5;AQ=2,1 GT:GQ:PL:DP:AD:RNC 0/0:50:.:79:79,0,0:.. ./.:.:.:20:.:OO
chr21 11000381 chr21_11000381_AG_A;chr21_11000382_G_A AG A,AA 2 . AF=0.5,0.5;AQ=2,1 GT:GQ:PL:DP:AD:RNC ./.:.:.:.:.:PP ./.:.:.:20:.:OO
chr21 16188963 chr21_16188963_CAT_C CAT C 2 . AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./0:32:20,0:3:.:-. 1/1:32:20,12:3:2,3,0:..
chr21 16188965 chr21_16188965_TAC_T TAC T 2 MONOALLELIC AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./.:32:.,12:.:.:OO ./.:32:.,0:3:.:11
chr21 16188967 chr21_16188967_C_G C G 2 . AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./.:32:.:.:.:OO 1/1:32:20,12:3:2,3,0:..
3 changes: 3 additions & 0 deletions test/gvcf_test_cases.cc
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,9 @@ class GVCFTestCase {

unifier_stats stats;
s = unified_sites(unifier_cfg, N, als, sites, stats);
if (!s.ok()) {
cout << s.str() << endl;
}
REQUIRE(s.ok());

REQUIRE(is_sorted(sites.begin(), sites.end()));
Expand Down

0 comments on commit 5a7040c

Please sign in to comment.