This repository has been archived by the owner on Feb 27, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
ADTEx.py
executable file
·445 lines (393 loc) · 17.5 KB
/
ADTEx.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#!/usr/bin/python2.7 -u
# ----------------------------------------------------------------------#
# Note from Jeltje van Baren, 2015:
# Reworked the code to make more robust
# Infer bam files from input name
# Removed threading
# Added final DNAcopy step to take in log scores instead of ratios
# (CBS code doesn't work properly on those)
# Catch runtime errors
# Altered original code to avoid copying large input files, and
# removed concatenation of png files into a pdf
#
# Copyright (c) 2013, Kaushalya Amarasinghe.
#
# > Source License <
# This file is part of ADTEx.
#
# ADTEx is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# ADTEx is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with ADTEx. If not, see <http://www.gnu.org/licenses/>.
#
#
#-----------------------------------------------------------------------#
import os
import argparse
import sys
import subprocess
import shutil
#absolute script path
scriptPath = os.path.realpath(os.path.dirname(sys.argv[0]))
class Options:
"""
Note that True and False are set to strings - this is done to pass them to R programs in the pipeline
"""
def __init__(self):
self.parser = argparse.ArgumentParser('Aberration Detection in Tumour EXome')
self.parser.add_argument('-n','--normal', required=True,
help = 'Matched normal sample in bam format OR bed formatted coverage [REQUIRED], to generate bed formatted coverage please see documentation',
dest='control')
self.parser.add_argument('-t','--tumor', required=True,
help = 'Tumor sample in bam format OR bed format DOC coverage [REQUIRED], to generate bed formatted coverage please see documentation',dest='tumor')
self.parser.add_argument('-s','--sampleid', type=str, required=True, help = 'sample ID (will be used in output) [REQUIRED]')
self.parser.add_argument('-b','--targetbed', required=True, help = 'BED format of the targeted regions [REQUIRED]')
self.parser.add_argument('-c', '--centromeres', type=str, required=True, help="Centromere bed file [REQUIRED]")
self.parser.add_argument('-o','--out', type=str, required=True,
help='Output folder name to store temporary files, folder will be created as a subfolder of the working directory if it does not exist [REQUIRED]', action='store', dest='outFolder')
self.parser.add_argument('--keeptemp',help='do not delete temporary output folder adtexout', dest='keeptemp',action='store_true')
self.parser.add_argument('--tostdout',help='Print cnv file to STDOUT, remove workdir. Only works if ploidy is not estimated.', dest='tostdout',action='store_true')
self.parser.add_argument('--ploidy', type=str, help='Most common ploidy in the tumour sample [2]',
dest='ploidy',action='store')
self.parser.add_argument('--estimatePloidy', help='If provided, --baf must be specified to estimate base ploidy [FALSE]',
dest='p_est', action='store_true',default='False')
self.parser.add_argument('--minReadDepth', type=str, default='10',
help='The threshold for minimum read depth for each exon [10]',
action='store', dest='minReadDepth' )
self.parser.add_argument('-p', '--plot',
help='Plots each chromosome with CNV estimates [False]',
action='store_true', dest='plot', default=False)
self.parser.add_argument('--baf', help='File containing B allele frequencies at heterozygous loci of the normal [optional]', dest='baf',action='store')
args = self.parser.parse_args()
self.control = args.control
self.tumor = args.tumor
self.sample = args.sampleid
self.bed = args.targetbed
self.centro = args.centromeres
self.outFolder = args.outFolder
self.minRead = args.minReadDepth
self.bafin = 'False'
self.baf = 'False'
if args.baf:
self.baf = args.baf
self.bafin = 'True'
self.tostdout = False
if args.tostdout:
self.tostdout = True
self.keeptemp = False
if args.keeptemp:
self.keeptemp = True
self.ploidyIn = 'False'
self.ploidy = '2'
if args.ploidy:
self.ploidy = args.ploidy
self.p_est='False'
if str(args.p_est)=='True':
print >>sys.stderr, "Ploidy provided. Estimation step won't be executed"
self.ploidyIn = 'True'
if str(args.p_est)=='True' and str(self.ploidyIn)=='False':
if args.baf:
self.p_est='True'
else:
self.parser.error('--baf must be provided if base ploidy estimation is used')
elif str(self.ploidyIn)=='False':
self.ploidy = '2'
self.p_est = 'False'
self.plot = 'False'
if args.plot:
self.plot = 'True'
def mkdir_p(path):
"""
Allow overwriting an existing directory
Make sure the file path starts with /data if we're on a docker container
"""
if not path.startswith('/data/'):
if 'docker' in open('/proc/self/cgroup').read():
path = os.path.join('/data/', path)
if not os.path.isdir(path):
try:
os.makedirs(path)
except:
raise
def getCoverage(bamfile, targetfile, outfile):
"""
Runs bedtools coverage with the -d flag
"""
with open(outfile, 'w') as o:
subprocess.check_call([
'bedtools', 'coverage',
'-abam', bamfile,
'-d',
'-b', targetfile,
], stdout = o)
def sortFile(infile,outfile, tmpdir):
"""
Sort input coverage file by chromosome and start, end positions
"""
with open(outfile, 'w') as o:
return subprocess.Popen(['sort', '-T', tmpdir, '-V', '-k1', '-k2n', '-k3n', '-k4n', infile], stdout=o)
def getChroms(infile):
"""
Returns a list of chromosome IDs found in the first column of the input file, in the same order
"""
chr=[]
chr_prev=0
with open (infile, 'r') as f:
for line in f:
chr_current=line.rstrip().split("\t")[0]
if (chr_current!=chr_prev and chr_current[0]!="G" and chr_current!='chr'):
chr_prev=chr_current
chr.append(chr_current)
return ",".join(chr)
def getMeanCoverage(infile,outfile):
"""
Calculate mean coverage in input file, outputs a file with chrom, startpos, endpos, and mean coverage
"""
with open(infile, 'r') as inF, open (outfile, 'w') as outF:
# keep track of the values per chromosome
chrdict = dict()
tot = 0
count = 0
for line in inF:
l = line.rstrip().split('\t')
ll = len(l)
tot = tot + float(l[ll-1])
if (float(l[ll-1]) <10):
count = count + 1
if (int(l[2]) - int(l[1]) == int(l[ll-2])):
if count > -1:
chr = str(l[0])
start = str(l[1])
end = str(l[2])
mean = round(float(tot)/int(l[ll-2]),0)
chrdict.setdefault(chr,set()).add(mean)
outF.write(chr+"\t"+start+"\t"+end+"\t"+str(mean)+"\n")
tot = 0
count =0
# sanity check
noCov = False
empties = []
for chr in chrdict:
if len(chrdict[chr]) == 1:
noCov = True
empties.append(chr)
if noCov:
print >>sys.stderr, "ERROR: These chromosomes have no coverage in {}: {}".format(infile, ', '.join(c for c in empties))
exit(1)
def segmentRatio(params, cCoverage, tCoverage, outF, chroms, ratioOutfile, snpSegfile):
"""
Calls segment_ratio.R
if bafin is positive but no ploidy is asked, the program outputs a second file snp_segments,
which is then called later from ADTEx.py
chroms is a comma separated list of chromosomes
"""
rScriptName = os.path.join(scriptPath,'segment_ratio.R')
rFunctionsPath = os.path.join(scriptPath,'RFunction.R')
subprocess.check_call(['Rscript', rScriptName, rFunctionsPath, cCoverage,
tCoverage, params.minRead, outF, params.bafin, params.baf,
params.ploidyIn, ratioOutfile, snpSegfile, chroms])
def analyseCNV(params, ratio_data, outdir, tmpdir, chroms):
"""
Calls cnv_analyse.R to estimate copynumber. When ploidy estimation is required, it runs the script three times, once
for ploidy 2, 3, and 4. Output (in outdir) is called cnv.result or (in tmpdir) cnv.result2, cnv.result3, cnv.result4
"""
rScriptName = os.path.join(scriptPath, 'cnv_analyse.R')
rFunctionsPath = os.path.join(scriptPath, 'RFunction.R')
# if we don't want to estimate ploidy
if(params.p_est == "False"):
outfile = os.path.join(outdir, "cnv.result")
with open(outfile, 'w') as o:
subprocess.check_call(['Rscript', rScriptName, rFunctionsPath,
ratio_data, params.ploidy, params.minRead,
params.bafin, params.baf, params.plot, chroms], stdout=o)
else:
# if we DO want to estimate ploidy, run CNV three times, once for each ploidy and output files as temporaries
# in tmpdir
procs = []
for ploidy in ['2', '3', '4']:
outfile = os.path.join(tmpdir, "cnv.result" + ploidy)
with open(outfile, 'w') as o:
p = subprocess.Popen(['Rscript', rScriptName, rFunctionsPath,
ratio_data, ploidy, params.minRead,
params.bafin, params.baf, params.plot, chroms], stdout=o)
procs.append(p)
for p in procs:
if p.wait() != 0:
raise Exception("Could not finish cnv_analyse.R")
def zygosity(params, outdir, cnv, chroms):
"""
Calls zygosity.R, which outputs two files: contamination (with a contamimation number) and zygosity.res, which is
a table with zygosity numbers. If plots are requested, these are put in outdir too.
"""
rScriptName = os.path.join(scriptPath,"zygosity.R")
subprocess.check_call(['Rscript', rScriptName, params.baf, outdir, cnv, params.minRead, params.plot, chroms])
def estimatePloidy(tmpdir, workdir, snpSegfile):
"""
Runs extract_cnv.R, bedtools intersect, and base_cnv.R.
extract_cnv.R expects cnv.result<ploidy> and outputs cnv<ploidy>
bedtools then intersects the cnv<ploidy> file with the snpSegfile created in segmentRatio
base_cnv uses the intersectfiles to determine the correct ploidy, which it writes to a file named ploidy
The corresponding file is then moved to the working directory
"""
rScriptName = os.path.join(scriptPath,"extract_cnv.R")
subprocess.check_call(['Rscript', rScriptName, tmpdir])
for i in ["2", "3", "4"]:
cnvfile = os.path.join(tmpdir, 'cnv' + i)
outfile = os.path.join(tmpdir, 'cnv' + i + "_baf.txt")
with open(outfile, 'w') as o:
subprocess.check_call([
'bedtools', 'intersect',
'-a', snpSegfile,
'-b', cnvfile,
'-wb'
], stdout = o)
rScriptName = os.path.join(scriptPath,"base_cnv.R")
subprocess.check_call(['Rscript', rScriptName, tmpdir, workdir])
# now move the cnv results with the selected ploidy to the output file
ploidy=open(os.path.join(workdir, "ploidy")).readline().strip()
shutil.copyfile(os.path.join(tmpdir, "cnv.result" + ploidy), os.path.join(workdir, "cnv.result"))
class cent(object):
"""centromere objects by chromosome"""
def __init__(self, line):
[id, start, end] = line.split("\t")
self.name = id
self.p = int(start)
self.q = int(end)
def getChrom(id, centList, cur):
"""get split chromosome IDs"""
if cur and id == cur.name:
return cur
for i in centList:
if id == i.name:
return i
return False
def doCBS(centro, tmpdir, infile, outfile, sampleId):
"""
Splits input cnv file in chromosome arms based on the centro bed file input,
then runs basicDNAcopy.R and removes the split
"""
# create centromere object list
centros=[]
with open(centro, 'r') as f:
for line in f:
line = line.strip()
centobj = cent(line)
centros.append(centobj)
# split the chromosomes in arms
splitfile = os.path.join(tmpdir, 'splitchrom.cnv')
with open (infile, 'r') as f, open(splitfile, 'w') as o:
curchrom = False
for line in f:
fields = line.split("\t")
if fields[0] == 'chr':
o.write(line)
continue
id = fields[0]
start = int(fields[1])
end = int(fields[2])
curchrom = getChrom(id, centros, curchrom)
# centromere free chromosomes
if not curchrom:
o.write(line)
# print only if segment does not overlap centromere
elif end < curchrom.p:
fields[0] = ('.').join([fields[0], 'p'])
o.write(("\t").join(fields))
elif start > curchrom.q:
fields[0] = ('.').join([fields[0], 'q'])
o.write(("\t").join(fields))
# run CBS
rScriptName = os.path.join(scriptPath,"basicDNAcopy.R")
cbs = subprocess.check_output(['Rscript', rScriptName, splitfile, "2.5", sampleId])
cbs = cbs.replace('.p\t', '\t')
cbs = cbs.replace('.q\t', '\t')
with open(outfile, 'w') as o:
o.write(cbs)
def main():
subprocess.call("date")
options = Options()
control = options.control
tumor = options.tumor
targets = options.bed
bafIn = options.bafin
# These folders are local to the docker container
workdir = options.outFolder
tmpdir = os.path.join(workdir, 'tmp')
mkdir_p(tmpdir)
if control.endswith('bam'):
print >>sys.stderr, 'Looks like input files are bam, creating coverage files...'
# first create the coverage file, then set the control variable to be that file
covOut = os.path.join(tmpdir, 'control.cov')
getCoverage(control, targets, covOut)
assert os.path.exists(covOut)
control = covOut
# now for tumor
covOut = os.path.join(tmpdir, 'tumor.cov')
getCoverage(tumor, targets, covOut)
assert os.path.exists(covOut)
tumor = covOut
else:
print >>sys.stderr, 'Looks like input files are bedtools coverage...'
chromstring= getChroms(targets)
print >>sys.stderr, 'Sorting input files...'
sortedcontrol = os.path.join(tmpdir, 'control.coverage.sorted')
sortedtumor = os.path.join(tmpdir, 'tumor.coverage.sorted')
sortdir = os.path.join(tmpdir, 'sort')
mkdir_p(sortdir)
p1 = sortFile(os.path.abspath(control), sortedcontrol, sortdir)
p2 = sortFile(os.path.abspath(tumor), sortedtumor, sortdir)
p1.wait()
p2.wait()
print >>sys.stderr, 'Generating mean coverage files...'
coveredcontrol = os.path.join(tmpdir, 'control.coverage')
coveredtumor = os.path.join(tmpdir, 'tumor.coverage')
getMeanCoverage(sortedcontrol, coveredcontrol)
getMeanCoverage(sortedtumor, coveredtumor)
print >>sys.stderr, 'Creating segment ratios...'
ratiofile = os.path.join(tmpdir, 'ratio.data')
snpSegfile = os.path.join(tmpdir, 'snp_segments')
segmentRatio(options, coveredcontrol, coveredtumor, workdir, chromstring, ratiofile, snpSegfile)
# replace chromstring in case the ratio file contains fewer chromosomes
chromstring= getChroms(ratiofile)
print >>sys.stderr, 'Analyzing CNV...'
cnvFiles = analyseCNV(options, ratiofile, workdir, tmpdir, chromstring)
if(options.p_est == "True"):
print >>sys.stderr, "Estimating base ploidy..."
estimatePloidy(tmpdir, workdir, snpSegfile)
if options.plot == "True":
#Runs plot_results.R, which expects a file named cnv.results in the input directory
#Creates one plot per chromosome.
rScriptName = os.path.join(scriptPath,"plot_results.R")
subprocess.check_call(['Rscript', rScriptName, workdir, chromstring])
# if baf is provided, predict zygosity
# at this point we should have a file called cnv.result in workdir
cnvfile = os.path.join(workdir, 'cnv.result')
assert os.path.exists(cnvfile)
if(bafIn == "True"):
print >>sys.stderr, "Predicting Zygosity states"
zygosity(options, workdir, cnvfile, chromstring)
# This is an addition to the original ADTEx code. In cnv_analyse.R, DNAcopy is
# called but not supplied with a proper logratio. Changing it in the code
# doesn't appear to work well, so here we post process it, also blocking out centromeres.
sampleId = options.sample
cbsOut = os.path.join(workdir, sampleId + '.cnv')
doCBS(options.centro, tmpdir, cnvfile, cbsOut, sampleId)
if options.tostdout and bafIn != "True":
with open(cbsOut, 'r') as f:
print f.read()
shutil.rmtree(workdir)
else:
# Delete temporary directory
if not options.keeptemp:
shutil.rmtree(tmpdir)
subprocess.call("date",shell=True)
if __name__ == "__main__":
main()