Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow increasing cd-hit-est memory allocation #255 #256

Merged
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ __pycache__/
# Distribution / packaging
.Python
env/
venv/
build/
develop-eggs/
dist/
Expand All @@ -26,7 +27,7 @@ sdist/
var/
*.egg-info/
.installed.cfg
*.egg
*.egg*

# PyInstaller
# Usually these files are written by a python script from a template
Expand All @@ -43,6 +44,7 @@ htmlcov/
.tox/
.coverage
.cache
out.card*
nosetests.xml
coverage.xml

Expand All @@ -55,3 +57,10 @@ docs/_build/

# PyBuilder
target/

# PyCharm
.idea

# Mac files
.DS_Store

4 changes: 3 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,11 @@ RUN wget -q http://downloads.sourceforge.net/project/bowtie-bio/bowtie2/2.2.9/bo
# _tkinter.TclError: no display name and no $DISPLAY environment variable
ENV ARIBA_BOWTIE2=$PWD/bowtie2-2.2.9/bowtie2 ARIBA_CDHIT=cdhit-est MPLBACKEND="agg"

RUN cd /usr/local/bin && ln -s /usr/bin/python3 python && cd

RUN git clone https://github.com/sanger-pathogens/ariba.git \
&& cd ariba \
&& git checkout v2.12.0 \
&& git checkout v2.13.4 \
&& python3 setup.py test \
&& python3 setup.py install

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ Download the latest release from this github repository or clone it. Run the tes

python3 setup.py test

**Note for OS X:** The tests require gawk, which will need to be installed separately, e.g. via Homebrew.

If the tests all pass, install:

python3 setup.py install
Expand Down
26 changes: 20 additions & 6 deletions ariba/cdhit.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,22 @@ def __init__(
seq_identity_threshold=0.9,
threads=1,
length_diff_cutoff=0.0,
memory_limit=None,
verbose=False,
min_cluster_number=0
):

if not os.path.exists(infile):
raise Error('File not found: "' + infile + '". Cannot continue')

if (memory_limit is not None) and (memory_limit < 0):
raise Error('Input parameter cdhit_max_memory is set to an invalid value. Cannot continue')

self.infile = os.path.abspath(infile)
self.seq_identity_threshold = seq_identity_threshold
self.threads = threads
self.length_diff_cutoff = length_diff_cutoff
self.memory_limit = memory_limit
self.verbose = verbose
self.min_cluster_number = min_cluster_number
extern_progs = external_progs.ExternalProgs(fail_on_error=True, using_spades=False)
Expand Down Expand Up @@ -133,24 +138,33 @@ def _get_clusters_from_bak_file(filename, min_cluster_number=0):
return clusters


def run(self):
tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
cdhit_fasta = os.path.join(tmpdir, 'cdhit')
cluster_info_outfile = cdhit_fasta + '.bak.clstr'

def get_run_cmd(self, output_file):
    '''Returns the cd-hit-est command line, as a single space-separated
    string, that clusters self.infile and writes the output to output_file.

    The -M (memory limit) option is only appended when self.memory_limit
    is not None; otherwise it is left off the command line altogether.'''
    args = [
        self.cd_hit_est,
        '-i', self.infile,
        '-o', output_file,
        '-c', str(self.seq_identity_threshold),
        '-T', str(self.threads),
        '-s', str(self.length_diff_cutoff),
        '-d 0',
        '-bak 1',
    ]

    # Compare against None instead of testing truthiness: a limit of 0
    # is a value the caller asked for and must still reach cd-hit-est
    if self.memory_limit is not None:
        args.extend(['-M', str(self.memory_limit)])

    return ' '.join(args)


def run(self):
    '''Runs cd-hit-est on self.infile inside a temporary directory made
    in the current working directory. Returns the clusters parsed from
    the resulting .bak.clstr file. The temporary directory is removed
    before returning.'''
    workdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
    fasta_out = os.path.join(workdir, 'cdhit')
    common.syscall(self.get_run_cmd(fasta_out), verbose=self.verbose)
    # The cluster listing is written next to the output fasta,
    # with this suffix
    found_clusters = self._get_clusters_from_bak_file(
        fasta_out + '.bak.clstr',
        self.min_cluster_number,
    )
    common.rmtree(workdir)
    return found_clusters


4 changes: 3 additions & 1 deletion ariba/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,10 @@ def run_bowtie2(
if LooseVersion(bowtie2_version) >= LooseVersion('2.3.1'):
map_cmd.append('--score-min G,1,10')

# We use gawk instead of awk here as we need bitwise comparisons
# and these are not available via awk on Mac OSX.
if remove_both_unmapped:
map_cmd.append(r''' | awk ' !(and($2,4)) || !(and($2,8)) ' ''')
map_cmd.append(r''' | gawk ' !(and($2,4)) || !(and($2,8)) ' ''')

tmp_sam_file = out_prefix + '.unsorted.sam'
map_cmd.append(' > ' + tmp_sam_file)
Expand Down
5 changes: 4 additions & 1 deletion ariba/ref_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def __init__(self,
genetic_code=11,
cdhit_min_id=0.9,
cdhit_min_length=0.0,
cdhit_max_memory=None,
run_cdhit=True,
clusters_file=None,
threads=1,
Expand All @@ -40,6 +41,7 @@ def __init__(self,
self.genetic_code = genetic_code
self.cdhit_min_id = cdhit_min_id
self.cdhit_min_length = cdhit_min_length
self.cdhit_max_memory = cdhit_max_memory
self.run_cdhit = run_cdhit
self.clusters_file = clusters_file
self.threads = threads
Expand Down Expand Up @@ -193,6 +195,7 @@ def run(self, outdir):
seq_identity_threshold=self.cdhit_min_id,
threads=self.threads,
length_diff_cutoff=self.cdhit_min_length,
memory_limit=self.cdhit_max_memory,
nocluster=not self.run_cdhit,
verbose=self.verbose,
clusters_file=self.clusters_file,
Expand All @@ -214,4 +217,4 @@ def run(self, outdir):
print(' grep REMOVE', os.path.join(outdir, '01.filter.check_genes.log'), file=sys.stderr)

if number_of_bad_variants_logged > 0:
print('WARNING. Problem with at least one variant. Problem variants are rmoved. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
print('WARNING. Problem with at least one variant. Problem variants are removed. Please see the file', os.path.join(outdir, '01.filter.check_metadata.log'), 'for details.', file=sys.stderr)
3 changes: 2 additions & 1 deletion ariba/reference_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ def write_cluster_allocation_file(clusters, outfile):
pyfastaq.utils.close(f_out)


def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, nocluster=False, verbose=False, clusters_file=None):
def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.0, memory_limit=None, nocluster=False, verbose=False, clusters_file=None):
clusters = {}
ReferenceData._write_sequences_to_files(self.sequences, self.metadata, outprefix)
ref_types = ('noncoding', 'noncoding.varonly', 'gene', 'gene.varonly')
Expand All @@ -454,6 +454,7 @@ def cluster_with_cdhit(self, outprefix, seq_identity_threshold=0.9, threads=1, l
seq_identity_threshold=seq_identity_threshold,
threads=threads,
length_diff_cutoff=length_diff_cutoff,
memory_limit=memory_limit,
verbose=verbose,
min_cluster_number = min_cluster_number,
)
Expand Down
1 change: 1 addition & 0 deletions ariba/tasks/prepareref.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def run(options):
genetic_code=options.genetic_code,
cdhit_min_id=options.cdhit_min_id,
cdhit_min_length=options.cdhit_min_length,
cdhit_max_memory=options.cdhit_max_memory,
run_cdhit=not options.no_cdhit,
clusters_file=options.cdhit_clusters,
threads=options.threads,
Expand Down
36 changes: 36 additions & 0 deletions ariba/tests/cdhit_test.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import unittest
import os
import re
from ariba import cdhit, external_progs


modules_dir = os.path.dirname(os.path.abspath(cdhit.__file__))
data_dir = os.path.join(modules_dir, 'tests', 'data')
extern_progs = external_progs.ExternalProgs()
Expand All @@ -13,6 +15,13 @@ def test_init_fail_infile_missing(self):
cdhit.Runner('oopsnotafile', 'out')


def test_init_fail_invalid_memory(self):
    '''test_init_fail_invalid_memory: a negative memory_limit must raise cdhit.Error'''
    fasta = os.path.join(data_dir, 'cdhit_test_run.in.fa')
    self.assertRaises(cdhit.Error, cdhit.Runner, fasta, memory_limit=-10)


def test_get_clusters_from_bak_file(self):
'''test _get_clusters_from_bak_file'''
infile = os.path.join(data_dir, 'cdhit_test_get_clusters_from_bak_file.in')
Expand Down Expand Up @@ -162,3 +171,30 @@ def test_run_get_clusters_from_file_with_renaming(self):
'1': {'seq3'},
}
self.assertEqual(clusters, expected_clusters)


def test_get_run_cmd_with_default_memory(self):
    '''test_get_run_cmd_with_default_memory: no -M option when memory_limit is left unset'''
    fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
    r = cdhit.Runner(fa_infile)
    run_cmd = r.get_run_cmd('foo/bar/file.out')
    # Escape the expected arguments so the dots in "file.out", "0.9" and
    # "0.0" match only a literal dot. The leading "^.+" still stands in
    # for the machine-specific cd-hit-est path and input file.
    expected_args = re.escape(' -o foo/bar/file.out -c 0.9 -T 1 -s 0.0 -d 0 -bak 1')
    self.assertRegex(run_cmd, '^.+' + expected_args + '$', msg="Command output was " + run_cmd)


def test_get_run_cmd_with_non_default_memory(self):
    '''test_get_run_cmd_with_non_default_memory: memory_limit=900 is passed on as -M 900'''
    fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
    r = cdhit.Runner(fa_infile, memory_limit=900)
    run_cmd = r.get_run_cmd('foo/bar/file.out')
    # Escape the expected arguments so the dots in "file.out", "0.9" and
    # "0.0" match only a literal dot. The leading "^.+" still stands in
    # for the machine-specific cd-hit-est path and input file.
    expected_args = re.escape(' -o foo/bar/file.out -c 0.9 -T 1 -s 0.0 -d 0 -bak 1 -M 900')
    self.assertRegex(run_cmd, '^.+' + expected_args + '$', msg="Command output was " + run_cmd)


def test_get_run_cmd_with_unlimited_memory(self):
    '''test_get_run_cmd_with_unlimited_memory: memory_limit=0 is still passed on, as -M 0'''
    fa_infile = os.path.join(data_dir, 'cdhit_test_run_get_clusters_from_dict_rename.in.fa')
    r = cdhit.Runner(fa_infile, memory_limit=0)
    run_cmd = r.get_run_cmd('foo/bar/file.out')
    # Escape the expected arguments so the dots in "file.out", "0.9" and
    # "0.0" match only a literal dot. The leading "^.+" still stands in
    # for the machine-specific cd-hit-est path and input file.
    expected_args = re.escape(' -o foo/bar/file.out -c 0.9 -T 1 -s 0.0 -d 0 -bak 1 -M 0')
    self.assertRegex(run_cmd, '^.+' + expected_args + '$', msg="Command output was " + run_cmd)
3 changes: 2 additions & 1 deletion scripts/ariba
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ cdhit_group = subparser_prepareref.add_argument_group('cd-hit options')
cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. Incompatible with --no_cdhit', metavar='FILENAME')
cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.0, metavar='FLOAT')
cdhit_group.add_argument('--cdhit_min_length', type=float, help='Length difference cutoff (cd-hit option -s) [%(default)s]', default=0.0, metavar='FLOAT')
cdhit_group.add_argument('--cdhit_max_memory', type=int, help='Memory limit in MB (cd-hit option -M) [%(default)s]. Use 0 for unlimited.', metavar='INT')

other_prep_group = subparser_prepareref.add_argument_group('other options')
other_prep_group.add_argument('--min_gene_length', type=int, help='Minimum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=6)
Expand Down
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
setup(
ext_modules=[minimap_mod, fermilite_mod, vcfcall_mod],
name='ariba',
version='2.13.3',
version='2.13.4',
description='ARIBA: Antibiotic Resistance Identification By Assembly',
packages = find_packages(),
package_data={'ariba': ['test_run_data/*', 'tb_data/*']},
Expand All @@ -72,7 +72,7 @@
'matplotlib',
'pyfastaq >= 3.12.0',
'pysam >= 0.9.1',
'pymummer>=0.10.2',
'pymummer<=0.10.3',
],
license='GPLv3',
classifiers=[
Expand Down