Skip to content

Commit

Permalink
Merge pull request #84 from supernifty/vep_offline
Browse files Browse the repository at this point in the history
Correct the VEP stage to annotate using offline (cached) resources (CPIPE-76)
  • Loading branch information
supernifty committed Jan 28, 2016
2 parents e6ed54d + 4986196 commit cf7bbff
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 51 deletions.
4 changes: 3 additions & 1 deletion pipeline/pipeline_stages_config.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -853,11 +853,13 @@ annotate_vep = {
perl $VEP/variant_effect_predictor.pl --cache --dir $VEP/../vep_cache
-i $input.vcf
--vcf -o $output.vcf
-species human
-species homo_sapiens
--canonical --per_gene --protein
--sift=b --polyphen=b
--symbol hgnc --force_overwrite --hgvs --maf_1kg --maf_esp --pubmed
--plugin Condel,$CONDEL/config,s
--offline
--verbose
""", "vep"
}

Expand Down
113 changes: 63 additions & 50 deletions pipeline/scripts/update_gene_lists.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python
'''
###########################################################################
#
# This file is part of Cpipe.
Expand Down Expand Up @@ -26,66 +27,78 @@
# Usage:
# update_gene_lists --source dir --target dir
####################################################################################
'''

import argparse
import datetime
import glob
import os
import re
import sys

CATEGORY = '1'

def write_log(log, msg):
    '''
    write date stamped msg to log

    log: writable file-like object (only its write method is used)
    msg: text of the log entry; a trailing newline is appended

    each entry is written as "yymmdd-HHMMSS: msg" using the current local time
    '''
    stamp = datetime.datetime.now().strftime('%y%m%d-%H%M%S')
    log.write('%s: %s\n' % (stamp, msg))

def update_gene_lists(source_dir, target_dir, log):
    '''
    adds genes from files of the form source_dir/*.add.genes.txt to gene lists in target_dir/cohort/cohort.genes.txt

    source_dir: directory searched for <cohort>.add.genes.txt files (one gene per line)
    target_dir: directory containing <cohort>/<cohort>.genes.txt files (gene<TAB>category per line)
    log: writable file-like object passed to write_log

    for each source file the cohort is the part of the file name before the first '.'.
    lines starting with '#' and blank lines are ignored in both files.
    gene names are compared and written upper-cased.
    genes not already in the target are added with category CATEGORY.
    the target is only rewritten when at least one gene is added; it is then written with
    fresh #version and #notes header lines followed by all genes sorted by name.
    a missing target is logged as an error and that cohort is skipped.

    raises IndexError if a non-comment, non-blank target line has no tab-separated category
    '''
    # sorted so cohorts are processed (and logged) in a reproducible order
    for filename in sorted(glob.glob(os.path.join(source_dir, '*.add.genes.txt'))):
        cohort = os.path.basename(filename).split('.')[0]
        # find corresponding flagship: target/cohort/cohort.genes.txt
        target = os.path.join(target_dir, cohort, '%s.genes.txt' % cohort)
        if not os.path.isfile(target):
            write_log(log, 'ERROR: target gene list %s does not exist' % target)
            continue

        # read existing genes and categories
        genes = {}
        with open(target, 'r') as fh:
            for line in fh:
                stripped = line.strip()
                # blank lines would otherwise fail on the category lookup below
                if line.startswith('#') or not stripped:
                    continue
                fields = stripped.split('\t')
                genes[fields[0].upper()] = fields[1]

        # read new genes
        added = set()
        candidates = set()
        with open(filename, 'r') as fh:
            for line in fh:
                gene = line.strip().upper()
                # blank lines must not become a gene with an empty name
                if line.startswith('#') or not gene:
                    continue
                candidates.add(gene)
                if gene not in genes:
                    added.add(gene)
                    genes[gene] = CATEGORY

        if not added:
            write_log(log, '%s: no changes from %i candidate(s)' % (cohort, len(candidates)))
            continue

        # write out additional
        added_names = ','.join(sorted(added))
        with open(target, 'w') as fh:
            fh.write('#version %s\n' % datetime.datetime.now().strftime('%y%m%d'))
            fh.write('#notes %i gene(s) added: %s\n' % (len(added), added_names))
            for gene in sorted(genes):
                fh.write('%s\t%s\n' % (gene, genes[gene]))
        write_log(log, '%s: %i gene(s) added from %i candidate(s): %s' % (cohort, len(added), len(candidates), added_names))

def main():
    '''
    update gene lists from command line options

    --source: directory containing the *.add.genes.txt files (required)
    --target: directory containing the per-cohort gene lists to update (required)
    --log: optional file that log entries are appended to; stderr is used when omitted
    '''
    parser = argparse.ArgumentParser(description='Add extra genes to cohort gene lists')
    parser.add_argument('--source', required=True, help='source of extra genes') # input
    parser.add_argument('--target', required=True, help='target containing gene files to update') # updated in place
    parser.add_argument('--log', required=False, help='write changes to this file') # output
    args = parser.parse_args()
    if args.log:
        # append so earlier runs are kept; the with block closes the file when done
        with open(args.log, 'a') as log:
            update_gene_lists(args.source, args.target, log)
    else:
        update_gene_lists(args.source, args.target, sys.stderr)


if __name__ == '__main__':
    main()

0 comments on commit cf7bbff

Please sign in to comment.