From aec31372cdabff403ba35ffc7c6b8318fd37e018 Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Fri, 9 Nov 2018 17:30:12 -0800 Subject: [PATCH 1/3] unifier: fix bug in obscure code path (monoallelic site with realigned deletion) seen in a DeepVariant cohort --- src/unifier.cc | 35 ++++++++++++----- test/data/gvcf_test_cases/deepvariant.yml | 46 +++++++++++++++++++++++ test/gvcf_test_cases.cc | 3 ++ 3 files changed, 74 insertions(+), 10 deletions(-) diff --git a/src/unifier.cc b/src/unifier.cc index c446228e..e66869ad 100644 --- a/src/unifier.cc +++ b/src/unifier.cc @@ -473,18 +473,21 @@ Status delineate_sites(const unifier_config& cfg, discovered_alleles& alleles, } for (const auto& pa : pruned) { - discovered_allele *longest_original_ref = nullptr; - for (const auto& al : pa.second.originals) { - const auto r = refs_by_range.find(al.pos); - if (r == refs_by_range.end()) { - return Status::Invalid("delineate_sites: missing REF allele for ", al.pos.str()); - } - if (!longest_original_ref || longest_original_ref->first.dna.size() < r->second.first.dna.size()) { - longest_original_ref = &(r->second); + // we need to supply a discovered reference allele to go along with + // the pruned alt allele; find the shortest one which contains the + // alt. note, we realigned the alt so this might not cover the + // original! + const discovered_allele *shortest_containing_ref = nullptr; + for (const auto& ref : refs_by_range) { + if (ref.first.contains(pa.first.pos) && + (!shortest_containing_ref || ref.first.size() < shortest_containing_ref->first.pos.size())) { + shortest_containing_ref = &ref.second; } } - assert(longest_original_ref); - all_pruned_alleles.push_back(make_pair(pa, *longest_original_ref)); + if (!shortest_containing_ref) { + return Status::Invalid("delineate_sites: missing REF allele for ", pa.first.str()); + } + all_pruned_alleles.push_back(make_pair(pa, *shortest_containing_ref)); } } return Status::OK(); @@ -575,6 +578,18 @@ Status unified_sites(const unifier_config& cfg, Status s; unifier_stats stats; + /* desperate-straits debugging: + for (const auto& allele : alleles) { + if (allele.first.pos.overlaps(range(7, 16188960, 16188970))) { + cerr << allele.first.str(); + if (allele.second.is_ref) { + cerr << " *"; + } + cerr << endl; + } + } + */ + map> sites; vector> all_pruned_alleles; S(delineate_sites(cfg, alleles, sites, all_pruned_alleles)); diff --git a/test/data/gvcf_test_cases/deepvariant.yml b/test/data/gvcf_test_cases/deepvariant.yml index d6c7dfe5..80e49e95 100644 --- a/test/data/gvcf_test_cases/deepvariant.yml +++ b/test/data/gvcf_test_cases/deepvariant.yml @@ -2,6 +2,12 @@ readme: | DeepVariant 0.5.0 gVCF dialect TODO: add third sample to provoke multiallelic site and focus PL liftover + The cluster at 16188963-16188973 provokes an obscure code path: a monoallelic + site including a non-left-aligned alternate allele that's equivalent to + another record in the same sample. (The 'in the same sample' bit hasn't + actually been seen in the wild, but otherwise the scenario is based on a real + example.) TODO: can we safely call more in NA12877? + input: header : |- ##fileformat=VCFv4.2 @@ -42,6 +48,9 @@ input: chr21 11000381 . AG A,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 1/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 chr21 11000382 . G A,<*> 4 PASS . GT:GQ:DP:AD:VAF:PL 1/1:4:20:0,20,0:1,0:1,14,0,990,990,990 chr21 11000383 . A <*> 0 . END=11000385 GT:GQ:MIN_DP:PL 0/0:50:50:0,150,1499 + chr21 16188963 . CAT C,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 1/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 + chr21 16188966 . A <*> 0 . END=16188966 GT:GQ:MIN_DP:PL 0/0:50:50:0,150,1499 + chr21 16188967 . C G,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 1/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 - NA12877.gvcf: | NA12877 chr21 9412001 . A <*> 0 . END=9412144 GT:GQ:MIN_DP:PL 0/0:50:23:0,84,839 @@ -71,6 +80,9 @@ input: chr21 9418260 . A T,<*> 36.1 PASS . GT:GQ:DP:AD:VAF:PL 1/1:35:91:0,91,0:1,0:36,40,0,990,990,990 chr21 9418261 . G <*> 0 . END=9418296 GT:GQ:MIN_DP:PL 0/0:50:79:0,276,2759 chr21 11000367 . G <*> 0 . END=11000385 GT:GQ:MIN_DP:PL 0/0:50:79:0,276,2759 + chr21 16188963 . C <*> 0 . END=16188964 GT:GQ:MIN_DP:PL 0/0:50:50:0,150,1499 + chr21 16188965 . TAC T,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 + chr21 16188967 . CATAT TAT,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 unifier_config: min_AQ1: 0 @@ -174,6 +186,37 @@ truth_unified_sites: quality: 1 frequency: 0.5 quality: 2 +- range: {ref: chr21, beg: 16188963, end: 16188965} + in_target: {ref: chr21, beg: 1, end: 1000000000} + alleles: + - dna: CAT + - dna: C + quality: 2 + frequency: 0.5 + lost_allele_frequency: 0.5 + quality: 2 +- range: {ref: chr21, beg: 16188965, end: 16188967} + in_target: {ref: chr21, beg: 1, end: 1000000000} + alleles: + - dna: TAC + - dna: T + quality: 2 + frequency: 0.5 + quality: 2 + monoallelic: true + unification: + - range: {beg: 16188967, end: 16188971} + dna: TAT + to: 1 +- range: {ref: chr21, beg: 16188967, end: 16188967} + in_target: {ref: chr21, beg: 1, end: 1000000000} + alleles: + - dna: C + - dna: G + quality: 2 + frequency: 0.5 + lost_allele_frequency: 0.5 + quality: 2 truth_output_vcf: - truth.vcf: | @@ -196,3 +239,6 @@ truth_output_vcf: chr21 9412441 chr21_9412441_TATA_T TATA T 31 . AF=0.5;AQ=31 GT:GQ:PL:DP:AD:RNC 0/1:26:31,0,27:47:25,22:.. 0/1:27:27,0,46:98:69,29:.. chr21 9418260 chr21_9418260_A_T;chr21_9418260_A_C A T,C 43 . AF=0.75,0.25;AQ=43,42 GT:GQ:PL:DP:AD:RNC 1/1:35:36,40,0,.,.,.:91:0,91,0:.. 1/2:38:45,48,42,48,0,43:135:0,106,29:.. chr21 11000381 chr21_11000381_AG_A;chr21_11000382_G_A AG A,AA 2 . AF=0.5,0.5;AQ=2,1 GT:GQ:PL:DP:AD:RNC 0/0:50:.:79:79,0,0:.. ./.:.:.:20:.:OO + chr21 16188963 chr21_16188963_CAT_C CAT C 2 . AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./0:32:20,0:3:.:-. 1/1:32:20,12:3:2,3,0:.. + chr21 16188965 chr21_16188965_TAC_T TAC T 2 MONOALLELIC AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./.:32:.,12:.:.:OO ./.:32:.,0:3:.:11 + chr21 16188967 chr21_16188967_C_G C G 2 . AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./.:32:.:.:.:OO 1/1:32:20,12:3:2,3,0:.. diff --git a/test/gvcf_test_cases.cc b/test/gvcf_test_cases.cc index 8625fb51..17f9a7ce 100644 --- a/test/gvcf_test_cases.cc +++ b/test/gvcf_test_cases.cc @@ -332,6 +332,9 @@ class GVCFTestCase { unifier_stats stats; s = unified_sites(unifier_cfg, N, als, sites, stats); + if (!s.ok()) { + cout << s.str() << endl; + } REQUIRE(s.ok()); REQUIRE(is_sorted(sites.begin(), sites.end())); From 497277a5fd55da48084c485b52dff1910037df41 Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Fri, 9 Nov 2018 19:48:59 -0800 Subject: [PATCH 2/3] genotyper: don't call 0/0 based on a reference confidence record with GT != 0/0 a DeepVariant phenomenon --- src/genotyper.cc | 13 +++++++++++++ test/data/gvcf_test_cases/deepvariant.yml | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/genotyper.cc b/src/genotyper.cc index 290d4e84..e11dada3 100644 --- a/src/genotyper.cc +++ b/src/genotyper.cc @@ -307,6 +307,19 @@ Status prepare_dataset_records(const genotyper_config& cfg, const unified_site& S(update_min_ref_depth(dataset, hdr, bcf_nsamples, sample_mapping, ref_records, depth, min_ref_depth)); + // ex post facto check for reference confidence records whose GT is other + // than 0/0 (probably ./.), which we'll translate to PartialData non-calls + for (const auto& rp : all_records) { + if (rp->is_ref) { + for (unsigned i = 0; i < 2*rp->p->n_sample; i++) { + if (bcf_gt_is_missing(rp->gt[i]) || bcf_gt_allele(rp->gt[i]) != 0) { + rnc = NoCallReason::PartialData; + return Status::OK(); + } + } + } + } + // Success... rnc = NoCallReason::N_A; return Status::OK(); diff --git a/test/data/gvcf_test_cases/deepvariant.yml b/test/data/gvcf_test_cases/deepvariant.yml index 80e49e95..75844ad2 100644 --- a/test/data/gvcf_test_cases/deepvariant.yml +++ b/test/data/gvcf_test_cases/deepvariant.yml @@ -79,7 +79,7 @@ input: chr21 9418226 . C <*> 0 . END=9418259 GT:GQ:MIN_DP:PL 0/0:50:90:0,294,2939 chr21 9418260 . A T,<*> 36.1 PASS . GT:GQ:DP:AD:VAF:PL 1/1:35:91:0,91,0:1,0:36,40,0,990,990,990 chr21 9418261 . G <*> 0 . END=9418296 GT:GQ:MIN_DP:PL 0/0:50:79:0,276,2759 - chr21 11000367 . G <*> 0 . END=11000385 GT:GQ:MIN_DP:PL 0/0:50:79:0,276,2759 + chr21 11000367 . G <*> 0 . END=11000385 GT:GQ:MIN_DP:PL ./.:0:1:29,3,0 chr21 16188963 . C <*> 0 . END=16188964 GT:GQ:MIN_DP:PL 0/0:50:50:0,150,1499 chr21 16188965 . TAC T,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 chr21 16188967 . CATAT TAT,<*> 5.5 PASS . GT:GQ:DP:AD:VAF:PL 0/1:3:32:20,12,0:0.375,0:2,3,0,990,990,990 @@ -238,7 +238,7 @@ truth_output_vcf: chr21 9412435 chr21_9412435_T_A T A 23 . AF=0.25;AQ=23 GT:GQ:PL:DP:AD:RNC 0/1:24:23,0,37:44:22,22:.. 0/0:14:0,13,42:103:89,14:.. chr21 9412441 chr21_9412441_TATA_T TATA T 31 . AF=0.5;AQ=31 GT:GQ:PL:DP:AD:RNC 0/1:26:31,0,27:47:25,22:.. 0/1:27:27,0,46:98:69,29:.. chr21 9418260 chr21_9418260_A_T;chr21_9418260_A_C A T,C 43 . AF=0.75,0.25;AQ=43,42 GT:GQ:PL:DP:AD:RNC 1/1:35:36,40,0,.,.,.:91:0,91,0:.. 1/2:38:45,48,42,48,0,43:135:0,106,29:.. - chr21 11000381 chr21_11000381_AG_A;chr21_11000382_G_A AG A,AA 2 . AF=0.5,0.5;AQ=2,1 GT:GQ:PL:DP:AD:RNC 0/0:50:.:79:79,0,0:.. ./.:.:.:20:.:OO + chr21 11000381 chr21_11000381_AG_A;chr21_11000382_G_A AG A,AA 2 . AF=0.5,0.5;AQ=2,1 GT:GQ:PL:DP:AD:RNC ./.:.:.:.:.:PP ./.:.:.:20:.:OO chr21 16188963 chr21_16188963_CAT_C CAT C 2 . AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./0:32:20,0:3:.:-. 1/1:32:20,12:3:2,3,0:.. chr21 16188965 chr21_16188965_TAC_T TAC T 2 MONOALLELIC AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./.:32:.,12:.:.:OO ./.:32:.,0:3:.:11 chr21 16188967 chr21_16188967_C_G C G 2 . AF=0.5;AQ=2 GT:DP:AD:GQ:PL:RNC ./.:32:.:.:.:OO 1/1:32:20,12:3:2,3,0:.. From 380482c64d280de60869c1d1c144764bca9a6abf Mon Sep 17 00:00:00 2001 From: Mike Lin Date: Fri, 9 Nov 2018 19:51:20 -0800 Subject: [PATCH 3/3] cli: fill out config descriptions --- src/cli_utils.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/cli_utils.cc b/src/cli_utils.cc index cb469349..66dfc233 100644 --- a/src/cli_utils.cc +++ b/src/cli_utils.cc @@ -464,7 +464,7 @@ bool check_dir_exists(const string &path) { // should reside in some user-modifiable yml file static const char* config_presets_yml = R"eof( gatk: - description: Merge and joint-call GATK-style gVCFs + description: Joint-call GATK-style gVCFs unifier_config: min_AQ1: 70 min_AQ2: 40 @@ -514,7 +514,7 @@ static const char* config_presets_yml = R"eof( count: 0 ignore_non_variants: true gatk_unfiltered: - description: Merge GATK-style gVCFs with no filtering or genotype revision. + description: Merge GATK-style gVCFs with no QC filters or genotype revision unifier_config: min_AQ1: 0 min_AQ2: 0 @@ -564,6 +564,7 @@ static const char* config_presets_yml = R"eof( count: 0 ignore_non_variants: true xAtlas: + description: Joint-call xAtlas gVCFs unifier_config: min_AQ1: 10 min_AQ2: 3 @@ -632,6 +633,7 @@ static const char* config_presets_yml = R"eof( count: 1 ignore_non_variants: true xAtlas_unfiltered: + description: Merge xAtlas gVCFs with no QC filters or genotype revision unifier_config: monoallelic_sites_for_lost_alleles: true genotyper_config: @@ -697,6 +699,7 @@ static const char* config_presets_yml = R"eof( count: 1 ignore_non_variants: true weCall: + description: Joint-call xAtlas gVCFs unifier_config: min_AQ1: 30 min_AQ2: 30 @@ -756,6 +759,7 @@ static const char* config_presets_yml = R"eof( count: 1 ignore_non_variants: true DeepVariant: + description: "[EXPERIMENTAL] Merge DeepVariant gVCFs with no QC filters or genotype revision" unifier_config: min_AQ1: 0 min_AQ2: 0