Skip to content

Commit

Permalink
Merge pull request #61 from Illumina/release/1.2.1
Browse files Browse the repository at this point in the history
Release/1.2.1
  • Loading branch information
jjzieve committed Apr 7, 2020
2 parents 2e4346d + 2a369c0 commit 3fdd391
Show file tree
Hide file tree
Showing 13 changed files with 8,016 additions and 8,035 deletions.
57 changes: 6 additions & 51 deletions BAlleleFreqFormat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,22 @@
from IlluminaBeadArrayFiles import RefStrand
import numpy as np

REVERSE_COMPLEMENT = {"A": "T", "T": "A", "G": "C", "C": "G", "I": "I", "D": "D"}

def extract_alleles_from_snp_string(snp_string):
"""
Splits SNP string into tuple of individual alleles
Args: snp_string (string): allele string of the format
[T/C] or [G/C]
Returns:
allele1 (string), allele2 (string)
"""
(allele1, allele2) = snp_string[1:4].split("/")
assert allele1 in REVERSE_COMPLEMENT, "allele %r is invalid (expected A,T,G,C,I,D)" % allele1
assert allele2 in REVERSE_COMPLEMENT, "allele %r is invalid (expected A,T,G,C,I,D)" % allele2
return allele1, allele2


def extract_reverse_complement_alleles_from_snp_string(snp_string):
def convert_indel_alleles(indel_allele, vcf_record):
"""
Splits SNP string into tuple of individual alleles and
gets takes the reverse compliment to match strand 1
Converts BPM allele to actual insertion/deletion alleles from the reference
Args: snp_string (string): allele string of the format
[T/C] or [G/C]
Args: indel_allele (string): BPM indel allele (I or D)
Returns:
allele1 (string), allele2 (string)
"""
# get alleles as tuple
allele1, allele2 = extract_alleles_from_snp_string(snp_string)
# get reverse compliment of bases and return
return REVERSE_COMPLEMENT[allele1], REVERSE_COMPLEMENT[allele2]


def convert_indel_alleles(snp_string, vcf_record):
"""
Splits SNP string into tuple of individual alleles and
converts to actual insertion/deletion alleles from the reference
Args: snp_string (string): allele string of the format
[I/D] or [D/I]
Returns:
allele1 (string), allele2 (string)
"""
# get alleles as tuple, we only need allele1 because if its "I" we can assume allele2 will be "D" and vice-versa
allele1, _ = extract_alleles_from_snp_string(snp_string)

assert len(vcf_record.ALT) == 1, "Cannot convert indel for BAF with multiple ALT alleles"
alt_allele = str(vcf_record.ALT[0])
ref_allele = vcf_record.REF
# Assuming whichever sequence is smaller is the deletion and the larger is the insertion
assert len(alt_allele) > len(ref_allele) or len(ref_allele) > len(alt_allele), "REF allele %r and ALT allele %r same length, cannot determine insertion or deletion" % (ref_allele, alt_allele)
deletion_allele, insertion_allele = (alt_allele, ref_allele) if len(alt_allele) < len(ref_allele) else (ref_allele, alt_allele)
if allele1 == "I":
if indel_allele == "I": # We only need 1 allele because if its "I" we can assume allele2 will be "D" and vice-versa
return insertion_allele, deletion_allele
return deletion_allele, insertion_allele

Expand Down Expand Up @@ -98,19 +58,14 @@ def generate_sample_format_info(self, bpm_records, vcf_record, sample_name):
return "."
for i in range(len(bpm_records)):
bpm_record = bpm_records[i]
snp = bpm_record.snp
strand = bpm_record.ref_strand
idx = bpm_record.index_num
b_allele_freq = self._b_allele_freq[idx]

# if indel, convert to actual ref and alt sequences
if bpm_record.is_indel():
allele1, allele2 = convert_indel_alleles(snp, vcf_record)
# if 2/minus strand, get rev comp
elif strand == RefStrand.Minus:
allele1, allele2 = extract_reverse_complement_alleles_from_snp_string(snp)
allele1, allele2 = convert_indel_alleles(bpm_record.plus_strand_alleles[0], vcf_record)
else:
allele1, allele2 = extract_alleles_from_snp_string(snp)
allele1, allele2 = bpm_record.plus_strand_alleles

# normalize to reference allele
ref_allele = vcf_record.REF
Expand Down
9 changes: 6 additions & 3 deletions GencallFormat.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from vcf.parser import _Format
from math import log10

class GencallFormat(object):
"""
Expand Down Expand Up @@ -30,7 +31,7 @@ def get_description():
@staticmethod
def get_format_obj():
return _Format(
GencallFormat.get_id(), 1, "String", GencallFormat.get_description())
GencallFormat.get_id(), 1, "Integer", GencallFormat.get_description())

def _get_min_gencall_score(self, bpm_records):
"""
Expand All @@ -55,6 +56,8 @@ def generate_sample_format_info(self, bpm_records, vcf_record, sample_name):
sample_name (string): Name of sample
Returns:
float: Minimum GenCall score (or None for empty arguments)
int: Minimum GenCall score, encoded as a phred quality (or None for empty arguments)
"""
return self._get_min_gencall_score(bpm_records)
min_gencall_score = self._get_min_gencall_score(bpm_records)
phred_quality_score = int(round(-10*log10(min(1.0, max(0.00001, 1 - min_gencall_score)))))
return phred_quality_score
2 changes: 1 addition & 1 deletion GenotypeFormat.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def generate_sample_format_info(self, bpm_records, vcf_record, sample_name):
int_genotype = self._genotypes[record.index_num]
if int_genotype != 0:
nucleotide_genotypes.append(convert_ab_genotype_to_nucleotide(
int_genotype, bpm_records[0].plus_strand_alleles))
int_genotype, record.plus_strand_alleles))
vcf_genotype = convert_indel_genotype_to_vcf(
nucleotide_genotypes, vcf_record, bpm_records[0].is_deletion, ploidy)
else:
Expand Down
2 changes: 2 additions & 0 deletions IlluminaBeadArrayFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -758,6 +758,8 @@ def __parse_file(self, manifest_file):
all_norm_ids = set()
for locus_idx in range(self.num_loci):
self.normalization_ids[locus_idx] += 100 * self.assay_types[locus_idx]
# To mimic the byte-wrapping behavior from GenomeStudio, AutoCall, IAAP take the mod of 256
self.normalization_ids[locus_idx] %= 256
all_norm_ids.add(self.normalization_ids[locus_idx])
sorted_norm_ids = sorted(all_norm_ids)
lookup_dictionary = {}
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ The supplied manifest file may either be in CSV or BPM format; however, a CSV fo

Any standard product manifest provided by Illumina will already follow these conventions.
### Reference genome
The contig identifiers in the provided genome FASTA file must match exactly the chromosome identifiers specified in the provided manifest. For a standard human product manifest, this means that the contig headers should read ">1" rather than ">chr1". For compatibility with BaseSpace Variant Interpreter (https://www.illumina.com/informatics/research/biological-data-interpretation/variant-interpreter.html), the specified path of the reference must contain either contain the string GrCh37 or GrCh38, for build 37 and 38, respectively. Suitable whole genome FASTA files can be built with the download_reference.sh script located within the scripts directory.
The contig identifiers in the provided genome FASTA file must match exactly the chromosome identifiers specified in the provided manifest. For a standard human product manifest, this means that the contig headers should read ">1" rather than ">chr1". For compatibility with BaseSpace Variant Interpreter (https://www.illumina.com/informatics/research/biological-data-interpretation/variant-interpreter.html), the specified path of the reference must contain either contain the string GrCh37 or GrCh38, for build 37 and 38, respectively. Suitable whole genome FASTA files can be built with the download_reference.sh script located within the scripts directory. This bash script is dependent on samtools (http://www.htslib.org/download/). If running on OSX, you may need to install coreutils (e.g. `brew install coreutils`).

### Squashing duplicates
In the manifest, there can be cases where the same variant is probed by multiple different assays. These assays may be the same design or alternate designs for the same locus. In the default mode of operation, these duplicates will be "squashed" into a single record in the VCF. The method used to incorporate information across multiple assays is under the latter "Output description" heading. When the "--unsquash-duplicates" option is provided, this "squashing" behavior is disabled, and each duplicate assay will be reported in a separate entry in the VCF file. This option is helpful when you are interested in investigating or validating the performance of individual assays, rather than trying to generate genotypes for specific variants. Note that if a locus has more than two alleles and is also queried with duplicated designs, the duplicates will not be unsquashed.
Expand Down Expand Up @@ -122,6 +122,7 @@ conda install -c bioconda pysam=0.9.0
```
where conda is a the package manager binary located in the installation location specified in the first step.

##
## License

Copyright 2018 Illumina
Expand Down
8 changes: 3 additions & 5 deletions VcfRecordFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,12 +68,10 @@ def create_vcf_record(self, bpm_record_group):
if len(auxiliary_record.alleles) != 2:
raise Exception(
"Auxiliary locus definition for " + auxiliary_record.ID + " is not bi-allelic")
for allele in auxiliary_record.alleles:
if len(str(allele)) <= 1:
raise Exception("Auxiliary locus definition for " +
if any([len(str(allele)) <= 1 for allele in auxiliary_record.alleles]):
raise Exception("Auxiliary locus definition for " +
auxiliary_record.ID + " is not a multi-nucleotide variant")
else:
return self._get_record_for_auxiliary(bpm_record_group, auxiliary_record)
return self._get_record_for_auxiliary(bpm_record_group, auxiliary_record)
else:
if bpm_record.is_indel():
return self._get_record_for_indel(bpm_record_group)
Expand Down
6 changes: 6 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 1.2.1 (2020/03/28)
* Fix for large manifests with norm id byte-wrapping issue (e.g. Omni5)
* Fixes for aux VCF MNV handling
* Updates GenCallScore to phred-scaled int
* Updates to download_reference.sh for new genome build location and to work on OSX

# 1.2.0 (2019/05/24)
* Fixes for python3 compatibility
* Use chromosome ordering from reference
Expand Down
2 changes: 1 addition & 1 deletion gtc_to_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from ReaderTemplateFactory import ReaderTemplateFactory
from FormatFactory import FormatFactory

VERSION = "1.2.0"
VERSION = "1.2.1"

def is_dir_writable(parent_dir):
try:
Expand Down
34 changes: 25 additions & 9 deletions scripts/download_reference.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,48 @@ then
exit 1
fi

output_file=`readlink -f ${1}`
if [[ "$OSTYPE" == "linux-gnu" ]]; then
output_file=`readlink -f ${1}`
elif [[ "$OSTYPE" == "darwin"* ]]; then
output_file=`greadlink -f ${1}`
else
echo "Error: Unsupported OS ${OSTYPE}"
exit 1
fi

output_dir=`dirname ${output_file}`
genome_build=${2:-"37"}

# Mitochondrial refseq moved to a different folder, hence the "mt_remote"
if [ ${genome_build} = "37" ]; then
remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/Homo_sapiens/ARCHIVE/BUILD.37.3/Assembled_chromosomes/seq"
remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/"
mt_remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/"
elif [ ${genome_build} = "38" ]; then
remote=ftp://ftp.ncbi.nlm.nih.gov/genomes/Homo_sapiens/Assembled_chromosomes/seq/
remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/"
mt_remote="ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_assembly_versions/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/"
else
echo "Error: Unsupported genome build ${genome_build}, valid values are 37,38"
exit 1
fi

temp_dir=`mktemp -d -p ${output_dir}`
temp_dir=`mktemp -d 2>/dev/null || mktemp -d -t ${output_dir}`
pushd ${temp_dir}

for chrom in `seq 1 22` X Y MT
for chrom in `seq 1 22` X Y
do
wget ${remote}/*_ref_*chr${chrom}.fa.gz
wget ${remote}/chr${chrom}.fna.gz
done
wget ${mt_remote}/chrMT.fna.gz

for chrom in `seq 1 22` X Y MT
build_fa(){
echo ">${1}" >> ${2}
gunzip -c chr${1}.fna.gz | grep -v ">" >> "${2}"
}
for chrom in `seq 1 22` X Y
do
echo ">${chrom}" >> ${output_file}
gunzip -c *_ref_*chr${chrom}.fa.gz | grep -v ">" >> ${output_file}
build_fa $chrom $output_file
done
build_fa "MT" $output_file

if hash samtools 2>/dev/null; then
samtools faidx ${output_file}
Expand Down
Loading

0 comments on commit 3fdd391

Please sign in to comment.