-
Notifications
You must be signed in to change notification settings - Fork 1
/
fpsac.sh
executable file
·460 lines (398 loc) · 14.5 KB
/
fpsac.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
# FPSAC, Fast Phylogenetic Scaffolding of Ancient Contigs.
# June 2013.
# cedric.chauve@sfu.ca
# PARAMETERS
#
# $1 = directory where data and results are stored, directory name (created if required)
# $2 = species tree with marked ancestor, file name
# file format: Nexus file, with no annotation, with branch lengths and ancestor node marked by @
# $3 = ancient contigs sequence file
# file format: FASTA
# $4 = extant genomes sequence file
# file format: FASTA
# $5 = length threshold used to define homologous families, integer
# $6 = similarity threshold used to define homologous families, integer
# $7 = upper bound on ancestral multiplicity used in the scaffolding phase, integer
# $8 = ancestor name, single word text
date
uname -a
echo
# -----------------------------------------------------------
# DIRECTORIES AND FILE NAMES
# -----------------------------------------------------------
echo "CREATING DIRECTORY STRUCTURE AND FORMATTING INPUT FILES"
DIR=$1
DATA=$DIR/input
RESULTS=$DIR/results_$8
CONTIGS=$RESULTS/contigs
SCAFFOLD=$RESULTS/scaffold
FINISHING=$RESULTS/finishing
FINISHING_ALG=$FINISHING/alignments
LG_THRESHOLD=$5
SIM_THRESHOLD=$6
MULT_THRESHOLD=$7
ANCESTOR_NAME=$8
echo
echo "DIRECTORY: $DIR"
echo "ANCESTOR NAME: $ANCESTOR_NAME"
echo
echo "---> Creating/cleaning directories"
mkdir -p $DATA
mkdir -p $CONTIGS
mkdir -p $SCAFFOLD
mkdir -p $FINISHING_ALG
rm -f $FINISHING_ALG/*fasta_muscle
TREE=$DATA/tree
SPECIES=$DATA/species
SPECIES_PAIRS=$DATA/species_pairs
EXTANT_GENOMES_FA=$DATA/extant_genomes.fasta
EXTANT_GENOMES_LG=$DATA/extant_genomes_length
EXTANT_GENOMES_DB=$DATA/extant_genomes_db
ANCESTRAL_CONTIGS_FA=$DATA/ancestral_contigs.fasta
ANCESTRAL_CONTIGS_LG=$DATA/ancestral_contigs_length
MEGABLAST_HITS=$CONTIGS/megablast_hits
ANCIENT_EXTANT_HITS=$CONTIGS/ancient_extant_hits
FAMILIES_COORDINATES_ALL=$CONTIGS/families_with_contig_names
FAMILIES_PROFILES_ALL=$CONTIGS/families_profiles
FAMILIES_ERRORS=$CONTIGS/families_errors
FAMILIES_ANCESTRAL_CONTENT=$CONTIGS/families_ancestral_content
FAMILIES_COORDINATES_FILTERED=$CONTIGS/families_filtered
FAMILIES_COORDINATES_FILTERED_DOUBLED=$CONTIGS/families_filtered_doubled
FAMILIES_FA=$CONTIGS/families.fasta
FAMILIES_LG=$CONTIGS/families_length
ADJACENCIES_ALL=$SCAFFOLD/adjacencies
ADJACENCIES_WGTD=$SCAFFOLD/adjacencies_weighted
ADJACENCIES_KEPT=$SCAFFOLD/adjacencies_kept
ADJACENCIES_DISC=$SCAFFOLD/adjacencies_discarded
ADJACENCIES_CARS_ALL=$SCAFFOLD/adjacencies_CARS
RSI_ALL=$SCAFFOLD/repeat_spanning_intervals
RSI_WGTD=$SCAFFOLD/repeat_spanning_intervals_weighted
RSI_KEPT=$SCAFFOLD/repeat_spanning_intervals_kept
RSI_DISC=$SCAFFOLD/repeat_spanning_intervals_disc
REPEAT_CLUSTERS=$SCAFFOLD/repeat_clusters
SCAFFOLD_ORDER_DOUBLED=$SCAFFOLD/scaffold_order_doubled
SCAFFOLD_ORDER=$SCAFFOLD/scaffold_order
ADJACENCIES_UNASSIGNED=$SCAFFOLD/adjacencies_unassigned
SCAFFOLD_FA=$SCAFFOLD/scaffold.fasta
SCAFFOLD_FINISHED_FA=$FINISHING/ancestral_sequence.fasta
SCAFFOLD_MAP=$FINISHING/ancestral_sequence_map
GAPS_COORDINATES=$SCAFFOLD/gaps_coordinates
GAPS_COORDINATES_LG=$SCAFFOLD/gaps_coordinates_and_length
REPORT=$DIR/report_$8
echo "---> Formatting and copying input files"
python src/fpsac_format_fasta.py $4 $EXTANT_GENOMES_FA $EXTANT_GENOMES_LG
python src/fpsac_format_fasta.py $3 $ANCESTRAL_CONTIGS_FA $ANCESTRAL_CONTIGS_LG
cp $2 $TREE
# -----------------------------------------------------------
# STAGE A. COMPUTING HOMOLOGOUS FAMILIES
# -----------------------------------------------------------
echo
# -----------------------------------------------------------
echo "STAGE A.1: COMPUTING CONTIGS/EXTANT GENOMES HITS"
echo "---> Computing hits: BLAST index for extant genomes (makembindex)"
makembindex \
-input ${EXTANT_GENOMES_FA} \
-output ${EXTANT_GENOMES_DB} \
-verbosity quiet
echo "---> Computing hits: BLAST database for extant genomes (makeblastdb)"
makeblastdb \
-in ${EXTANT_GENOMES_FA} \
-dbtype nucl
echo "---> Computing hits (blastn -task megablast)"
blastn -task megablast \
-db ${EXTANT_GENOMES_FA} \
-query ${ANCESTRAL_CONTIGS_FA} \
-use_index True \
-index_name ${EXTANT_GENOMES_DB} \
-out ${MEGABLAST_HITS} \
-outfmt 6
echo "---> Formatting hits"
python src/fpsac_format_blast_hits.py \
${MEGABLAST_HITS} \
${ANCESTRAL_CONTIGS_LG} \
${ANCIENT_EXTANT_HITS}
# Output: $ANCIENT_EXTANT_HITS - contigs/genomes hits file
# -----------------------------------------------------------
echo
echo "STAGE A.2: COMPUTING HOMOLOGOUS FAMILIES AND PROFILES"
echo "---> Computing families"
python src/fpsac_compute_families_coordinates_and_profiles.py \
${ANCIENT_EXTANT_HITS} \
${FAMILIES_COORDINATES_ALL}_1 \
${FAMILIES_PROFILES_ALL}_1 \
${LG_THRESHOLD} \
${SIM_THRESHOLD}
python src/fpsac_correct_families.py \
${FAMILIES_COORDINATES_ALL}_1 \
${FAMILIES_PROFILES_ALL}_1 \
${FAMILIES_COORDINATES_ALL} \
${FAMILIES_PROFILES_ALL} \
${FAMILIES_ERRORS}
rm -f ${FAMILIES_COORDINATES_ALL}_1 ${FAMILIES_PROFILES_ALL}_1
# Output: ${FAMILIES_COORDINATES_ALL} - families and markers coordinates
# Output: ${FAMILIES_PROFILES_ALL} - families extant profiles
# Output: ${FAMILIES_ERRORS} - families where markers could not be oriented (discarded from further analyses)
# -----------------------------------------------------------
echo
echo "STAGE A.3: COMPUTING ANCESTRAL CONTENT/MULTIPLICITIES"
echo "---> Computing ancestral content/multiplicities"
python src/fpsac_compute_families_ancestral_content.py \
$TREE \
${FAMILIES_PROFILES_ALL} \
${FAMILIES_ANCESTRAL_CONTENT}
# Output: ${FAMILIES_ANCESTRAL_CONTENT} - families ancestral content
# -----------------------------------------------------------
# STAGE B. SCAFFOLDING
# -----------------------------------------------------------
echo
echo "STAGE B.1: SPECIES, SPECIES PAIRS, DOUBLING MARKERS, FILTERING HIGH MULTIPLICITIES FAMILIES"
echo "---> Computing species pairs list"
python src/fpsac_compute_species_pairs_list.py \
$TREE \
${SPECIES_PAIRS}
# Output: ${SPECIES_PAIRS} - pairs of species whose genomes are compared
echo "---> Computing species list"
python src/fpsac_compute_species_list.py \
$TREE \
ALL \
$SPECIES
# Output: $SPECIES - list of species in the tree
echo "---> Filtering high multiplicity homologous families"
python src/fpsac_filter_families.py \
${FAMILIES_COORDINATES_ALL} \
${FAMILIES_ANCESTRAL_CONTENT} \
${MULT_THRESHOLD} \
${FAMILIES_COORDINATES_FILTERED}
# Output: ${FAMILIES_COORDINATES_FILTERED} - families with multiplicity below chosen threshold
echo "---> Doubling markers"
python src/fpsac_double_markers.py \
${FAMILIES_COORDINATES_FILTERED} \
${FAMILIES_COORDINATES_FILTERED_DOUBLED}
# Output: ${FAMILIES_COORDINATES_FILTERED_DOUBLED} - doubled markers
# -----------------------------------------------------------
echo
echo "STAGE B.2.a: COMPUTING AND WEIGHTING ADJACENCIES"
echo "---> Computing adjacencies"
python src/fpsac_compute_conserved_syntenies.py \
${FAMILIES_COORDINATES_FILTERED_DOUBLED} \
${FAMILIES_ANCESTRAL_CONTENT} \
$SPECIES \
${SPECIES_PAIRS} \
1 \
ADJ \
${ADJACENCIES_ALL}
# Note: parameter 1 indicates that the considered genomes are circular
# Output: $ADJACENCIES_ALL - Dollo conserved adjacencies between markers
echo "---> Weighting adjacencies"
python src/fpsac_weight.py \
${ADJACENCIES_ALL} \
$TREE \
${ADJACENCIES_WGTD} \
d
# Output: $ADJACENCIES_WGTD - weighted adjacencies
# -----------------------------------------------------------
echo
echo "STAGE B.2.b: COMPUTING AND WEIGHTING REPEAT SPANNING INTERVALS"
echo "---> Computing repeat spanning intervals"
python src/fpsac_compute_conserved_syntenies.py \
${FAMILIES_COORDINATES_FILTERED_DOUBLED} \
${FAMILIES_ANCESTRAL_CONTENT} \
$SPECIES \
${SPECIES_PAIRS} \
1 \
RSI \
${RSI_ALL}
# Output: ${RSI_ALL} - repeat spanning intervals
echo "---> Weighting repeat spanning intervals"
python src/fpsac_weight.py \
${RSI_ALL} \
$TREE \
${RSI_WGTD}_tmp \
d
python src/fpsac_correct_weighted_repeat_spanning_intervals.py \
${RSI_WGTD}_tmp \
${RSI_ALL} \
${RSI_WGTD}
rm -f ${RSI_WGTD}_tmp
# Output: $RSI_WGTD - weighted repeat spanning intervals
# -----------------------------------------------------------
echo
echo "STAGE B.3.a: LINEARIZATION/CIRCULARIZATION"
echo "---> Filtering adjacencies"
python src/fpsac_filter_adjacencies.py \
${ADJACENCIES_WGTD} \
${FAMILIES_ANCESTRAL_CONTENT} \
1 \
${ADJACENCIES_KEPT} \
${ADJACENCIES_DISC}
# Note: parameter 1 indicates that the considered genomes are circular
# Output: $ADJACENCIES_KEPT - kept adjacencies
# Output: $ADJACENCIES_DISC - discarded adjacencies
echo "---> Filtering repeat spanning intervals"
python src/fpsac_filter_repeat_spanning_intervals.py \
${RSI_WGTD} \
${ADJACENCIES_KEPT} \
${FAMILIES_ANCESTRAL_CONTENT} \
${RSI_KEPT} \
${RSI_DISC}
# Output: $RSI_KEPT - kept repeat spanning intervals
# Output: $RSI_DISC - discarded repeat spanning intervals
# -----------------------------------------------------------
echo
echo "STAGE B.3.b: COMPUTING SCAFFOLDS FROM FILTERED ADJACENCIES AND REPEAT SPANNING INTERVALS"
echo "---> Computing doubled scaffolds"
python src/fpsac_compute_scaffold_order.py \
${RSI_KEPT} \
${ADJACENCIES_KEPT} \
${FAMILIES_ANCESTRAL_CONTENT} \
${SCAFFOLD_ORDER_DOUBLED} \
${ADJACENCIES_UNASSIGNED} \
${ANCESTOR_NAME}
# Output: $SCAFFOLD_ORDER_DOUBLED - scaffold order with doubled markers
# Output: $ADJACENCIES_UNASSIGNED - adjacencies unassigned to any scaffold
echo "---> Undoubling markers"
python src/fpsac_halve_scaffold.py \
${SCAFFOLD_ORDER_DOUBLED} \
${SCAFFOLD_ORDER}
# Output: $SCAFFOLD_ORDER - scaffold order
# -----------------------------------------------------------
echo
echo "STAGE B.3.c: COMPUTING ADJACENCIES BETWEEN CARS AND REPEAT CLUSTERS"
echo "---> Misc: Computing adjacencies between CARs extremities"
python src/fpsac_compute_cars_adjacencies.py \
${FAMILIES_COORDINATES_FILTERED_DOUBLED} \
${SCAFFOLD_ORDER_DOUBLED} \
$SPECIES \
1 \
${ADJACENCIES_CARS_ALL}
# Note: parameter 1 indicates that the considered genomes are circular
# Output: $ADJACENCIES_CARS_ALL - all adjacencies between CARS extremities
echo "---> Misc: Computing maximal repeat clusters"
python src/fpsac_compute_repeat_clusters.py \
${ADJACENCIES_ALL} \
${FAMILIES_ANCESTRAL_CONTENT} \
${REPEAT_CLUSTERS}
# Output: $REPEAT_CLUSTERS - list of all maximal repeat clusters
# -----------------------------------------------------------
echo
echo "STAGE C.1: COMPUTING ANCESTRAL GAPS LENGTHS"
echo "---> Extracting conserved extant gaps"
python src/fpsac_extract_gaps.py \
${ADJACENCIES_KEPT} \
${RSI_KEPT} \
${FAMILIES_ANCESTRAL_CONTENT} \
${GAPS_COORDINATES}
# Output: $GAPS_COORDINATES - coordinates in extant genomes of all gaps present in the scaffolds
echo "---> Computing gaps length"
python src/fpsac_compute_supported_gaps.py \
${GAPS_COORDINATES} \
${SPECIES_PAIRS} \
1 0 \
${GAPS_COORDINATES_LG}
# Output: $GAPS_COORDINATES_LG - coordinates in extant genomes and length in the ancestral genome of Dollo conserved gaps
# -----------------------------------------------------------
echo
echo "STAGE C.2: COMPUTING AND ALIGNING EXTANT GAPS SEQUENCES"
echo "---> Computing gaps sequences"
python src/fpsac_extract_extant_gaps_sequences.py \
${EXTANT_GENOMES_FA} \
${GAPS_COORDINATES_LG} \
${FINISHING_ALG}/
echo "---> Aligning gaps sequences with muscle"
GAPS_SEQ=`ls ${FINISHING_ALG}/*fasta`
for F in $GAPS_SEQ; do
python src/fpsac_format_fasta.py \
$F \
$F"_extant" \
$FINISHING/TMP
muscle \
-in $F"_extant" \
-out $F"_muscle_1" \
-quiet
# Below is to handle the well documented muscle bug: http://www.drive5.com/muscle/manual/msa_getletter_bug.html
if [ ! -f $F"_muscle_1" ];
then
echo "-----> Retrying with option maxiters 1: file " $F
muscle \
-in $F"_extant" \
-out $F"_muscle_1" \
-quiet -maxiters 1
fi
python src/fpsac_format_fasta.py \
$F"_muscle_1" \
$F"_muscle" \
$FINISHING/TMP
rm -f $FINISHING/TMP $F"_muscle_1"
done
# Output: $FINISHING_ALG/*.fasta_extant - extant gaps FASTA files
# Output: $FINISHING_ALG/*.fasta_muscle - aligned extant gaps FASTA files
# -----------------------------------------------------------
echo
echo "STAGE C.3: COMPUTING ANCESTRAL GAPS SEQUENCES"
echo "---> Computing ancestral gaps sequences"
for F in $GAPS_SEQ; do
python src/fpsac_compute_ancestral_gap_sequence.py \
$F"_muscle" \
$TREE \
$F"_ancestral"
done
# Output: $FINISHING_ALG/*.fasta_ancestral - ancestral gaps FASTA files
# -----------------------------------------------------------
echo
echo "STAGE C.4: COMPUTING ANCESTRAL SCAFFOLD SEQUENCE AND MAP"
echo "---> Computing sequences of ancestral markers"
python src/fpsac_extract_families_sequences.py \
${FAMILIES_COORDINATES_ALL} \
${ANCESTRAL_CONTIGS_FA} \
${FAMILIES_FA} \
${FAMILIES_LG}
# Output: $FAMILIES_FA - FASTA sequences of ancestral markers
# Output: $FAMILIES_LG - length of sequences of ancestral markers
echo "---> Computing scaffold sequence with Ns in gaps"
python src/fpsac_compute_scaffold_sequence_N.py \
${FAMILIES_FA} \
${SCAFFOLD_ORDER} \
${GAPS_COORDINATES_LG} \
${SCAFFOLD_FA}
# Output: $SCAFFOLD_FA - scaffold sequences with Ns in gaps
echo "---> Computing ancestral sequence with gaps"
python src/fpsac_compute_scaffold_sequence.py \
${FAMILIES_FA} \
${SCAFFOLD_ORDER} \
${GAPS_COORDINATES_LG} \
${FINISHING_ALG}/ \
${SCAFFOLD_FINISHED_FA}
# Output: $SCAFFOLD_FINISHED_FA - FASTA sequence of the ancestral scaffold, including gaps
echo "---> Computing ancestral map"
python src/fpsac_compute_scaffold_map.py \
${FAMILIES_COORDINATES_ALL} \
${FAMILIES_FA} \
${SCAFFOLD_ORDER} \
${GAPS_COORDINATES_LG} \
${FINISHING_ALG}/ \
${SCAFFOLD_MAP}
# Output: $SCAFFOLD_MAP - map of the ancestral scaffold
# -----------------------------------------------------------
echo
echo "STAGE D: GENERATING REPORT"
python src/fpsac_generate_report.py \
$REPORT \
$ANCESTRAL_CONTIGS_LG \
$EXTANT_GENOMES_LG \
$ANCIENT_EXTANT_HITS \
$FAMILIES_COORDINATES_ALL \
$FAMILIES_ANCESTRAL_CONTENT \
$FAMILIES_COORDINATES_FILTERED \
$FAMILIES_LG \
$ADJACENCIES_WGTD \
$ADJACENCIES_KEPT \
$ADJACENCIES_DISC \
$REPEAT_CLUSTERS \
$RSI_WGTD \
$RSI_KEPT \
$RSI_DISC \
$ADJACENCIES_CARS_ALL \
$SCAFFOLD_ORDER \
$GAPS_COORDINATES_LG \
$SCAFFOLD_MAP \
$ADJACENCIES_UNASSIGNED