Merge pull request #61 from Illumina/release/1.2.1

Release/1.2.1
Illumina · Apr 7, 2020 · 3fdd391 · 3fdd391
2 parents 2e4346d + 2a369c0
commit 3fdd391
Show file tree

Hide file tree

Showing 13 changed files with 8,016 additions and 8,035 deletions.
diff --git a/BAlleleFreqFormat.py b/BAlleleFreqFormat.py
@@ -2,62 +2,22 @@
 from IlluminaBeadArrayFiles import RefStrand
 import numpy as np
 
-REVERSE_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G", "I": "I", "D": "D"}
-
-def extract_alleles_from_snp_string(snp_string):
-    """
-    Splits SNP string into tuple of individual alleles
-
-    Args: snp_string (string): allele string of the format
-        [T/C] or [G/C]
-
-    Returns:
-         allele1 (string), allele2 (string)
-    """
-    (allele1, allele2) = snp_string[1:4].split("/")
-    assert allele1 in REVERSE_COMPLEMENT, "allele %r is invalid (expected A,T,G,C,I,D)" % allele1
-    assert allele2 in REVERSE_COMPLEMENT, "allele %r is invalid (expected A,T,G,C,I,D)" % allele2
-    return allele1, allele2
-
-
-def extract_reverse_complement_alleles_from_snp_string(snp_string):
+def convert_indel_alleles(indel_allele, vcf_record):
     """
-    Splits SNP string into tuple of individual alleles and
-    gets takes the reverse compliment to match strand 1
+    Converts BPM allele to actual insertion/deletion alleles from the reference
 
-    Args: snp_string (string): allele string of the format
-        [T/C] or [G/C]
+    Args: indel_allele (string): BPM indel allele (I or D)
 
     Returns:
          allele1 (string), allele2 (string)
     """
-    # get alleles as tuple
-    allele1, allele2 = extract_alleles_from_snp_string(snp_string)
-    # get reverse compliment of bases and return
-    return REVERSE_COMPLEMENT[allele1], REVERSE_COMPLEMENT[allele2]
-
-
-def convert_indel_alleles(snp_string, vcf_record):
-    """
-    Splits SNP string into tuple of individual alleles and
-    converts to actual insertion/deletion alleles from the reference
-
-    Args: snp_string (string): allele string of the format
-        [I/D] or [D/I]
-
-    Returns:
-         allele1 (string), allele2 (string)
-    """
-    # get alleles as tuple, we only need allele1 because if its "I" we can assume allele2 will be "D" and vice-versa
-    allele1, _ = extract_alleles_from_snp_string(snp_string)
-
     assert len(vcf_record.ALT) == 1, "Cannot convert indel for BAF with multiple ALT alleles"
     alt_allele = str(vcf_record.ALT[0])
     ref_allele = vcf_record.REF
     # Assuming whichever sequence is smaller is the deletion and the larger is the insertion
     assert len(alt_allele) > len(ref_allele) or len(ref_allele) > len(alt_allele), "REF allele %r and ALT allele %r same length, cannot determine insertion or deletion" % (ref_allele, alt_allele)
     deletion_allele, insertion_allele = (alt_allele, ref_allele) if len(alt_allele) < len(ref_allele) else (ref_allele, alt_allele)
-    if allele1 == "I":
+    if indel_allele == "I": # We only need 1 allele because if its "I" we can assume allele2 will be "D" and vice-versa
         return insertion_allele, deletion_allele
     return deletion_allele, insertion_allele
 
@@ -98,19 +58,14 @@ def generate_sample_format_info(self, bpm_records, vcf_record, sample_name):
             return "."
         for i in range(len(bpm_records)):
             bpm_record = bpm_records[i]
-            snp = bpm_record.snp
-            strand = bpm_record.ref_strand
             idx = bpm_record.index_num
             b_allele_freq = self._b_allele_freq[idx]
 
             # if indel, convert to actual ref and alt sequences
             if bpm_record.is_indel():
-                allele1, allele2 = convert_indel_alleles(snp, vcf_record)
-            # if  2/minus strand, get rev comp
-            elif strand == RefStrand.Minus:
-                allele1, allele2 = extract_reverse_complement_alleles_from_snp_string(snp)
+                allele1, allele2 = convert_indel_alleles(bpm_record.plus_strand_alleles[0], vcf_record)
             else:
-                allele1, allele2 = extract_alleles_from_snp_string(snp)
+                allele1, allele2 = bpm_record.plus_strand_alleles
 
             # normalize to reference allele
             ref_allele = vcf_record.REF

diff --git a/GencallFormat.py b/GencallFormat.py
@@ -1,4 +1,5 @@
 from vcf.parser import _Format
+from math import log10
 
 class GencallFormat(object):
     """
@@ -30,7 +31,7 @@ def get_description():
     @staticmethod
     def get_format_obj():
         return _Format(
-            GencallFormat.get_id(), 1, "String", GencallFormat.get_description())
+            GencallFormat.get_id(), 1, "Integer", GencallFormat.get_description())
 
     def _get_min_gencall_score(self, bpm_records):
         """
@@ -55,6 +56,8 @@ def generate_sample_format_info(self, bpm_records, vcf_record, sample_name):
             sample_name (string): Name of sample
 
         Returns:
-            float: Minimum GenCall score (or None for empty arguments)
+            int: Minimum GenCall score, encoded as a phred quality (or None for empty arguments)
         """
-        return self._get_min_gencall_score(bpm_records)
+        min_gencall_score = self._get_min_gencall_score(bpm_records)
+        phred_quality_score = int(round(-10*log10(min(1.0, max(0.00001, 1 - min_gencall_score)))))
+        return phred_quality_score
diff --git a/GenotypeFormat.py b/GenotypeFormat.py
@@ -310,7 +310,7 @@ def generate_sample_format_info(self, bpm_records, vcf_record, sample_name):
                 int_genotype = self._genotypes[record.index_num]
                 if int_genotype != 0:
                     nucleotide_genotypes.append(convert_ab_genotype_to_nucleotide(
-                        int_genotype, bpm_records[0].plus_strand_alleles))
+                        int_genotype, record.plus_strand_alleles))
             vcf_genotype = convert_indel_genotype_to_vcf(
                 nucleotide_genotypes, vcf_record, bpm_records[0].is_deletion, ploidy)
         else:

diff --git a/IlluminaBeadArrayFiles.py b/IlluminaBeadArrayFiles.py
@@ -758,6 +758,8 @@ def __parse_file(self, manifest_file):
             all_norm_ids = set()
             for locus_idx in range(self.num_loci):
                 self.normalization_ids[locus_idx] += 100 * self.assay_types[locus_idx]
+                # To mimic the byte-wrapping behavior from GenomeStudio, AutoCall, IAAP take the mod of 256
+                self.normalization_ids[locus_idx] %= 256
                 all_norm_ids.add(self.normalization_ids[locus_idx])
             sorted_norm_ids = sorted(all_norm_ids)
             lookup_dictionary = {}

diff --git a/README.md b/README.md
@@ -69,7 +69,7 @@ The supplied manifest file may either be in CSV or BPM format; however, a CSV fo
 
 Any standard product manifest provided by Illumina will already follow these conventions. 
 ### Reference genome
-The contig identifiers in the provided genome FASTA file must match exactly the chromosome identifiers specified in the provided manifest. For a standard human product manifest, this means that the contig headers should read ">1" rather than ">chr1". For compatibility with BaseSpace Variant Interpreter (https://www.illumina.com/informatics/research/biological-data-interpretation/variant-interpreter.html), the specified path of the reference must contain either contain the string GrCh37 or GrCh38, for build 37 and 38, respectively. Suitable whole genome FASTA files can be built with the download_reference.sh script located within the scripts directory.
+The contig identifiers in the provided genome FASTA file must match exactly the chromosome identifiers specified in the provided manifest. For a standard human product manifest, this means that the contig headers should read ">1" rather than ">chr1". For compatibility with BaseSpace Variant Interpreter (https://www.illumina.com/informatics/research/biological-data-interpretation/variant-interpreter.html), the specified path of the reference must contain either contain the string GrCh37 or GrCh38, for build 37 and 38, respectively. Suitable whole genome FASTA files can be built with the download_reference.sh script located within the scripts directory. This bash script is dependent on samtools (http://www.htslib.org/download/). If running on OSX, you may need to install coreutils (e.g. `brew install coreutils`).
 
 ### Squashing duplicates
 In the manifest, there can be cases where the same variant is probed by multiple different assays. These assays may be the same design or alternate designs for the same locus. In the default mode of operation, these duplicates will be "squashed" into a single record in the VCF. The method used to incorporate information across multiple assays is under the latter "Output description" heading. When the "--unsquash-duplicates" option is provided, this "squashing" behavior is disabled, and each duplicate assay will be reported in a separate entry in the VCF file. This option is helpful when you are interested in investigating or validating the performance of individual assays, rather than trying to generate genotypes for specific variants. Note that if a locus has more than two alleles and is also queried with duplicated designs, the duplicates will not be unsquashed. 
@@ -122,6 +122,7 @@ conda install -c bioconda pysam=0.9.0
 ```
 where conda is a the package manager binary located in the installation location specified in the first step. 
 
+##
 ## License
 
 Copyright 2018 Illumina

diff --git a/VcfRecordFactory.py b/VcfRecordFactory.py
@@ -68,12 +68,10 @@ def create_vcf_record(self, bpm_record_group):
                 if len(auxiliary_record.alleles) != 2:
                     raise Exception(
                         "Auxiliary locus definition for " + auxiliary_record.ID + " is not bi-allelic")
-                for allele in auxiliary_record.alleles:
-                    if len(str(allele)) <= 1:
-                        raise Exception("Auxiliary locus definition for " +
+                if any([len(str(allele)) <= 1 for allele in auxiliary_record.alleles]):
+                    raise Exception("Auxiliary locus definition for " +
                                         auxiliary_record.ID + " is not a multi-nucleotide variant")
-                    else:
-                        return self._get_record_for_auxiliary(bpm_record_group, auxiliary_record)
+                return self._get_record_for_auxiliary(bpm_record_group, auxiliary_record)
         else:
             if bpm_record.is_indel():
                 return self._get_record_for_indel(bpm_record_group)

diff --git a/changelog.md b/changelog.md
@@ -1,3 +1,9 @@
+# 1.2.1 (2020/03/28)
+* Fix for large manifests with norm id byte-wrapping issue (e.g. Omni5)
+* Fixes for aux VCF MNV handling
+* Updates GenCallScore to phred-scaled int
+* Updates to download_reference.sh for new genome build location and to work on OSX
+
 # 1.2.0 (2019/05/24)
 * Fixes for python3 compatibility
 * Use chromosome ordering from reference

diff --git a/gtc_to_vcf.py b/gtc_to_vcf.py
@@ -17,7 +17,7 @@
 from ReaderTemplateFactory import ReaderTemplateFactory
 from FormatFactory import FormatFactory
 
-VERSION = "1.2.0"
+VERSION = "1.2.1"
 
 def is_dir_writable(parent_dir):
     try:

diff --git a/scripts/download_reference.sh b/scripts/download_reference.sh
@@ -6,32 +6,48 @@ then
     exit 1
 fi
 
-output_file=`readlink -f ${1}`
+if [[ "$OSTYPE" == "linux-gnu" ]]; then
+    output_file=`readlink -f ${1}`
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    output_file=`greadlink -f ${1}`
+else
+    echo "Error: Unsupported OS ${OSTYPE}"
+	exit 1
+fi
+
 output_dir=`dirname ${output_file}`
 genome_build=${2:-"37"}
 
+# Mitochondrial refseq moved to a different folder, hence the "mt_remote"
 if [ ${genome_build} = "37" ]; then
-	remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/Homo_sapiens/ARCHIVE/BUILD.37.3/Assembled_chromosomes/seq"
+	remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/"
+    mt_remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/"
 elif [ ${genome_build} = "38" ]; then
-    remote=ftp://ftp.ncbi.nlm.nih.gov/genomes/Homo_sapiens/Assembled_chromosomes/seq/
+    remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/"
+    mt_remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/"
 else
 	echo "Error: Unsupported genome build ${genome_build}, valid values are 37,38"
 	exit 1
 fi
 
-temp_dir=`mktemp -d -p ${output_dir}`
+temp_dir=`mktemp -d 2>/dev/null || mktemp -d -t ${output_dir}`
 pushd ${temp_dir}
 
-for chrom in `seq 1 22` X Y MT
+for chrom in `seq 1 22` X Y
 do
-    wget ${remote}/*_ref_*chr${chrom}.fa.gz
+    wget ${remote}/chr${chrom}.fna.gz
 done
+wget ${mt_remote}/chrMT.fna.gz
 
-for chrom in `seq 1 22` X Y MT
+build_fa(){
+    echo ">${1}" >> ${2}
+    gunzip -c chr${1}.fna.gz | grep -v ">" >> "${2}"
+}
+for chrom in `seq 1 22` X Y
 do
-    echo ">${chrom}" >> ${output_file}
-    gunzip -c *_ref_*chr${chrom}.fa.gz | grep -v ">" >> ${output_file}
+    build_fa $chrom $output_file
 done
+build_fa "MT" $output_file
 
 if hash samtools 2>/dev/null; then
     samtools faidx ${output_file}