Skip to content

Commit

Permalink
Merge pull request #194 from raphael-group/phase-bugfix
Browse files Browse the repository at this point in the history
bug fixes and additional QC
  • Loading branch information
balabanmetin authored Aug 30, 2023
2 parents d3b38e4 + 46249ae commit 9226e44
Show file tree
Hide file tree
Showing 5 changed files with 33 additions and 6 deletions.
14 changes: 13 additions & 1 deletion src/hatchet/utils/ArgParsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,12 @@ def parse_count_reads_args(args=None):
bams = [args.normal] + args.tumor
for bamfile in bams:
ensure(isfile(bamfile), 'The specified tumor BAM file does not exist')

# also make sure the bam index files are present too
for bamfile in bams:
ensure(
isfile(bamfile + '.bai') or isfile(bamfile.replace('.bam', '.bai')),
'The specified tumor BAM file does not exist',
)
names = args.samples
ensure(
(names is None) or (len(bams) == len(names)),
Expand Down Expand Up @@ -996,6 +1001,13 @@ def parse_genotype_snps_arguments(args=None):
args.snps = None
if args.snps is not None and not (isfile(args.snps) or url_exists(args.snps)):
error('The provided list of SNPs does not exist!', raise_exception=True)
# if the input snps file is a bgzip compressed vcf file
# and associated tabix file is not located in the same directory, report error
if args.snps is not None and isfile(args.snps) and args.snps.endswith('gz') and not isfile(args.snps + '.tbi'):
error(
'The provided list of SNPs is a bgzip compressed vcf file' 'but the associated tabix file does not exist!',
raise_exception=True,
)

# Extract the names of the chromosomes and check their consistency across the given BAM files and the reference
chromosomes = extractChromosomes(samtools, normal, [], args.reference)
Expand Down
2 changes: 1 addition & 1 deletion src/hatchet/utils/cluster_bins.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def read_bb(bbfile, subset=None):
populated_labels = False

chr_labels = []
for ch, df0 in bb.groupby(['#CHR']):
for ch, df0 in bb.groupby('#CHR'):
df0 = df0.sort_values('START')

p_arrs = []
Expand Down
18 changes: 16 additions & 2 deletions src/hatchet/utils/genotype_snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,29 +253,43 @@ def callSNPs(self, bamfile, samplename, chromosome):
cmd_mpileup = '{} mpileup {} -Ou -f {} --skip-indels -a INFO/AD,AD,DP -q {} -Q {} -d {}'.format(
self.bcftools, bamfile, self.reference, self.q, self.Q, self.dp
)
cmd_call = '{} call -mv -Oz -o {}'.format(self.bcftools, outfile)

if self.snplist is not None:
assert os.path.isfile(tgtfile)
cmd_mpileup += ' -T {}'.format(tgtfile)
else:
cmd_mpileup += ' -r {}'.format(chromosome)
if self.E:
cmd_mpileup += ' -E'
cmd_call = '{} call -mv -Ou'.format(self.bcftools)
cmd_filter = "{} view -i 'FMT/DP>={}' -Oz -o {}".format(self.bcftools, self.mincov, outfile)

with open(errname, 'w') as err:
pcss = []
mpileup = pr.Popen(
shlex.split(cmd_mpileup),
stdout=pr.PIPE,
stderr=err,
universal_newlines=True,
)
pcss.append(mpileup)
call = pr.Popen(
shlex.split(cmd_call),
stdin=mpileup.stdout,
stdout=pr.PIPE,
stderr=err,
universal_newlines=True,
)
codes = map(lambda p: p.wait(), [mpileup, call])
pcss.append(call)
filter = pr.Popen(
shlex.split(cmd_filter),
stdin=call.stdout,
stdout=pr.PIPE,
stderr=err,
universal_newlines=True,
)
pcss.append(filter)
codes = [p.wait() for p in pcss]
if any(c != 0 for c in codes):
raise ValueError(
error('SNP Calling failed on {} of {}, please check errors in {}!').format(
Expand Down
3 changes: 2 additions & 1 deletion src/hatchet/utils/phase_snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,7 @@ def run_shapeit(self, infile, chromosome):
f'--input-ref {inref} '
f'--output-log {self.outdir}/{chromosome}_alignments'
)
cmd_touch = f'touch {self.outdir}/{chromosome}_alignments.snp.strand.exclude'

cmd_phase = (
f'{self.shapeit} --input-vcf {self.outdir}/{chromosome}_filtered.vcf.gz '
Expand All @@ -356,7 +357,7 @@ def run_shapeit(self, infile, chromosome):

run(cmd_check, stdouterr_filepath=errname, check_return_codes=False) # May return 1
run(
[cmd_phase, cmd_convert, cmd_compress, cmd_index],
[cmd_touch, cmd_phase, cmd_convert, cmd_compress, cmd_index],
stdouterr_filepath=errname,
error_msg=f'Phasing failed on {infile}.',
)
Expand Down
2 changes: 1 addition & 1 deletion tests/test_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def test_genotype_snps(_mock1, bams, output_folder):
'-r',
config.paths.reference,
'-c',
'290', # min reads
'0', # min reads
'-C',
'300', # max reads
'-o',
Expand Down

0 comments on commit 9226e44

Please sign in to comment.