diff --git a/ariba/__init__.py b/ariba/__init__.py
index 4446c002..9f5f5d84 100644
--- a/ariba/__init__.py
+++ b/ariba/__init__.py
@@ -1,5 +1,9 @@
 __all__ = [
+    'assembly',
+    'assembly_compare',
+    'assembly_variants',
     'bam_parse',
+    'best_seq_chooser',
     'cdhit',
     'cluster',
     'clusters',
@@ -10,8 +14,12 @@
     'histogram',
     'link',
     'mapping',
-    'refcheck',
+    'reference_data',
+    'ref_genes_getter',
+    'report',
     'scaffold_graph',
+    'sequence_metadata',
+    'sequence_variant',
     'summary',
     'tasks',
 ]
diff --git a/ariba/assembly.py b/ariba/assembly.py
new file mode 100644
index 00000000..9be64d24
--- /dev/null
+++ b/ariba/assembly.py
@@ -0,0 +1,313 @@
+import os
+import shutil
+import pyfastaq
+import pymummer
+from ariba import common, mapping, bam_parse, external_progs
+
+class Error (Exception): pass  # exception type raised by the Assembly class in this module
+
+class Assembly:
+    def __init__(self,
+      reads1,
+      reads2,
+      ref_fasta,
+      working_dir,
+      final_assembly_fa,
+      final_assembly_bam,
+      log_fh,
+      scaff_name_prefix='scaffold',
+      kmer=0,
+      assembler='spades',
+      bowtie2_preset='very-sensitive-local',
+      max_insert=1000,
+      min_scaff_depth=10,
+      min_scaff_length=50,
+      nucmer_min_id=90,
+      nucmer_min_len=50,
+      nucmer_breaklen=50,
+      spades_other_options=None,
+      sspace_k=20,
+      sspace_sd=0.4,
+      reads_insert=500,
+      extern_progs=None,
+    ):
+        '''Stores the settings and works out the names of all the intermediate files.
+           Makes working_dir, raising Error if that fails (eg because it already exists).'''
+        self.reads1 = os.path.abspath(reads1)
+        self.reads2 = os.path.abspath(reads2)
+        self.ref_fasta = os.path.abspath(ref_fasta)
+        self.working_dir = os.path.abspath(working_dir)
+        self.final_assembly_fa = os.path.abspath(final_assembly_fa)
+        self.final_assembly_bam = os.path.abspath(final_assembly_bam)
+        self.log_fh = log_fh
+        self.scaff_name_prefix = scaff_name_prefix
+
+        self.assembly_kmer = self._get_assembly_kmer(kmer, reads1, reads2)
+        self.assembler = assembler
+        self.bowtie2_preset = bowtie2_preset
+        self.max_insert = max_insert
+        self.min_scaff_depth = min_scaff_depth
+        self.min_scaff_length = min_scaff_length
+        self.nucmer_min_id = nucmer_min_id
+        self.nucmer_min_len = nucmer_min_len
+        self.nucmer_breaklen = nucmer_breaklen
+        self.spades_other_options = spades_other_options
+        self.sspace_k = sspace_k
+        self.sspace_sd = sspace_sd
+        self.reads_insert = reads_insert
+
+        # Use a default ExternalProgs object unless the caller supplied one
+        self.extern_progs = external_progs.ExternalProgs() if extern_progs is None else extern_progs
+
+        try:
+            os.mkdir(self.working_dir)
+        except OSError as e:
+            raise Error('Error mkdir ' + self.working_dir) from e
+
+        self.assembler_dir = os.path.join(self.working_dir, 'Assemble')
+        self.assembly_contigs = os.path.join(self.working_dir, 'contigs.fa')
+        self.scaffold_dir = os.path.join(self.working_dir, 'Scaffold')
+        self.scaffolder_scaffolds = os.path.join(self.working_dir, 'scaffolds.fa')
+        self.gapfill_dir = os.path.join(self.working_dir, 'Gapfill')
+        self.gapfilled_scaffolds = os.path.join(self.working_dir, 'scaffolds.gapfilled.fa')
+        self.gapfilled_length_filtered = os.path.join(self.working_dir, 'scaffolds.gapfilled.length_filtered.fa')
+
+
+    @staticmethod
+    def _get_assembly_kmer(k, reads1, reads2):
+        '''Returns the kmer to assemble with. A non-zero k is returned unchanged. When k is 0,
+           uses 2/3 of the mean read length (using first 1000 forward and first 1000 reverse
+           reads), rounded, and then bumped up by one if that number is even'''
+        if k != 0:
+            return k
+
+        mean_fwd = pyfastaq.tasks.mean_length(reads1, limit=1000)
+        mean_rev = pyfastaq.tasks.mean_length(reads2, limit=1000)
+        chosen_kmer = round((mean_fwd + mean_rev) / 3)
+        # x + 1 - x % 2 leaves an odd x alone and adds one to an even x
+        return chosen_kmer + 1 - chosen_kmer % 2
+
+
+    def _assemble_with_spades(self, unittest=False):
+        '''Runs SPAdes from inside working_dir and sets self.assembled_ok. On success, symlinks
+           contigs.fa to the SPAdes scaffolds file; on failure, writes the error to spades_errors.'''
+        cmd = ' '.join([
+            self.extern_progs.exe('spades'),
+            '-1', self.reads1,
+            '-2', self.reads2,
+            '-o', self.assembler_dir,
+            '-k', str(self.assembly_kmer),
+            '--untrusted-contigs', self.ref_fasta,
+        ])
+        if self.spades_other_options is not None:
+            cmd += ' ' + self.spades_other_options
+        cwd = os.getcwd()
+        try:
+            os.chdir(self.working_dir)
+        except OSError as e:
+            raise Error('Error chdir ' + self.working_dir) from e
+        spades_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')
+
+        if unittest:  # do not run SPAdes: just make an empty scaffolds file
+            os.mkdir(self.assembler_dir)
+            open(spades_contigs, 'w').close()
+            self.assembled_ok = True
+        else:
+            self.assembled_ok, err = common.syscall(cmd, verbose=True, allow_fail=True, verbose_filehandle=self.log_fh, print_errors=False)
+        if self.assembled_ok:
+            os.symlink(spades_contigs, os.path.basename(self.assembly_contigs))
+        else:
+            spades_errors_file = os.path.join(self.working_dir, 'spades_errors')
+            with open(spades_errors_file, 'w') as f:
+                print(err, file=f)
+
+        os.chdir(cwd)
+
+
+    def _scaffold_with_sspace(self):
+        '''Scaffolds contigs.fa with SSPACE, leaving a scaffolds.fa symlink in working_dir.
+           If SSPACE is not available, scaffolds.fa is just a symlink to contigs.fa'''
+        if not os.path.exists(self.assembly_contigs):
+            raise Error('Cannot scaffold because contigs file not found: ' + self.assembly_contigs)
+        try:
+            os.mkdir(self.scaffold_dir)
+        except OSError as e:
+            raise Error('Error mkdir ' + self.scaffold_dir) from e
+
+        cwd = os.getcwd()
+        if self.extern_progs.exe('sspace') is None:
+            # contigs.fa lives in working_dir, so the relative symlink must be made there
+            os.chdir(self.working_dir)
+            os.symlink(os.path.basename(self.assembly_contigs), os.path.basename(self.scaffolder_scaffolds))
+            os.chdir(cwd)
+            return
+
+        os.chdir(self.scaffold_dir)
+        lib_file = 'lib'
+        with open(lib_file, 'w') as f:
+            print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=f)
+
+        cmd = ' '.join([
+            'perl', self.extern_progs.exe('sspace'),
+            '-k', str(self.sspace_k),
+            '-l', lib_file,
+            '-s', self.assembly_contigs
+        ])
+        sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+        os.chdir(self.working_dir)
+        os.symlink(os.path.relpath(sspace_scaffolds), os.path.basename(self.scaffolder_scaffolds))
+        os.chdir(cwd)
+
+
+    @staticmethod
+    def _has_gaps_to_fill(filename):
+        '''Returns True iff at least one sequence in the file has an N (upper or lower case)'''
+        records = pyfastaq.sequences.file_reader(filename)
+        # any() stops reading at the first sequence that has a gap
+        found_gap = any('n' in record.seq or 'N' in record.seq for record in records)
+        return found_gap
+
+
+    @staticmethod
+    def _rename_scaffolds(infile, outfile, prefix):
+        '''Copies the sequences from infile to outfile, renaming them (in file order)
+           to prefix.scaffold.1, prefix.scaffold.2, etc'''
+        records = pyfastaq.sequences.file_reader(infile)
+        out_fh = pyfastaq.utils.open_file_write(outfile)
+        for number, record in enumerate(records, start=1):
+            record.id = '.'.join([prefix, 'scaffold', str(number)])
+            print(record, file=out_fh)
+        pyfastaq.utils.close(out_fh)
+
+
+    def _gap_fill_with_gapfiller(self):
+        '''Runs GapFiller on scaffolds.fa and writes the renamed result to scaffolds.gapfilled.fa.
+           If GapFiller is not available or there are no Ns to fill, just renames the scaffolds'''
+        if not os.path.exists(self.scaffolder_scaffolds):
+            raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)
+
+        if self.extern_progs.exe('gapfiller') is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
+            self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds, self.scaff_name_prefix)
+            return
+
+        try:
+            os.mkdir(self.gapfill_dir)
+        except OSError as e:
+            raise Error('Error mkdir ' + self.gapfill_dir) from e
+
+        cwd = os.getcwd()
+        os.chdir(self.gapfill_dir)
+        lib_file = 'lib'
+        with open(lib_file, 'w') as f:
+            print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)
+
+        cmd = ' '.join([
+            'perl', self.extern_progs.exe('gapfiller'),
+            '-l', lib_file,
+            '-s', self.scaffolder_scaffolds
+        ])
+        gapfilled_scaffolds = os.path.join(self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+        self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds, self.scaff_name_prefix)
+        os.chdir(cwd)
+
+
+    @staticmethod
+    def _fix_contig_orientation(contigs_fa, ref_fa, outfile, min_id=90, min_length=50, breaklen=50):
+        '''Writes the contigs to outfile, reverse complementing each contig whose nucmer hits
+           to the reference are all on the opposite strand. Contigs that had hits in both
+           orientations are written unchanged, and the set of their names is returned'''
+        if not os.path.exists(contigs_fa):
+            raise Error('Cannot fix orientation of assembly contigs because file not found: ' + contigs_fa)
+
+        coords_file = os.path.join(outfile + '.tmp.rename.coords')
+        nucmer_runner = pymummer.nucmer.Runner(
+            ref_fa,
+            contigs_fa,
+            coords_file,
+            min_id=min_id,
+            min_length=min_length,
+            breaklen=breaklen,
+        )
+        nucmer_runner.run()
+
+        forward_names = set()
+        reverse_names = set()
+        for hit in pymummer.coords_file.reader(coords_file):
+            names = forward_names if hit.on_same_strand() else reverse_names
+            names.add(hit.qry_name)
+
+        os.unlink(coords_file)
+        both_strands = reverse_names.intersection(forward_names)
+        # only flip the contigs that never hit the reference on the same strand
+        reverse_only = reverse_names.difference(forward_names)
+
+        out_fh = pyfastaq.utils.open_file_write(outfile)
+        for contig in pyfastaq.sequences.file_reader(contigs_fa):
+            if contig.id in reverse_only:
+                contig.revcomp()
+            print(contig, file=out_fh)
+        pyfastaq.utils.close(out_fh)
+
+        return both_strands
+
+
+    @staticmethod
+    def _parse_bam(sequences, bam, min_scaff_depth, max_insert):
+        '''Parses bam with bam_parse.Parser, writes its files, and returns whether the scaffold graph is consistent'''
+        if os.path.exists(bam):
+            parser = bam_parse.Parser(bam, sequences)
+            parser.parse()
+            parser.write_files(bam)
+            return parser.scaff_graph_is_consistent(min_scaff_depth, max_insert)
+        raise Error('File not found: ' + bam)
+
+
+    def run(self):
+        '''Assembles, scaffolds, gap fills, length filters and orients the contigs, then maps the
+           reads to them and parses the BAM. Stops early, with assembled_ok False, if no contigs'''
+        self._assemble_with_spades()
+        self.sequences = {}
+
+        # double-check we got some contigs
+        number_of_contigs = pyfastaq.tasks.count_sequences(self.assembly_contigs) if os.path.exists(self.assembly_contigs) else 0
+        self.assembled_ok = number_of_contigs != 0
+        if not self.assembled_ok:
+            # This is to make this object picklable, to keep multithreading happy
+            self.log_fh = None
+            return
+
+        self._scaffold_with_sspace()
+        self._gap_fill_with_gapfiller()
+        pyfastaq.tasks.filter(self.gapfilled_scaffolds, self.gapfilled_length_filtered, minlength=self.min_scaff_length)
+        if pyfastaq.tasks.count_sequences(self.gapfilled_length_filtered) == 0:
+            self.assembled_ok = False
+            # This is to make this object picklable, to keep multithreading happy
+            self.log_fh = None
+            return
+
+        # Use the nucmer settings given to the constructor, instead of always the defaults
+        contigs_both_strands = self._fix_contig_orientation(
+            self.gapfilled_length_filtered, self.ref_fasta, self.final_assembly_fa,
+            min_id=self.nucmer_min_id, min_length=self.nucmer_min_len, breaklen=self.nucmer_breaklen)
+        self.has_contigs_on_both_strands = len(contigs_both_strands) > 0
+        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.sequences)
+
+        mapping.run_bowtie2(
+            self.reads1,
+            self.reads2,
+            self.final_assembly_fa,
+            self.final_assembly_bam[:-4],  # drops the last 4 characters (presumably '.bam' - not checked)
+            threads=1,
+            sort=True,
+            samtools=self.extern_progs.exe('samtools'),
+            bowtie2=self.extern_progs.exe('bowtie2'),
+            bowtie2_preset=self.bowtie2_preset,
+            verbose=True,
+            verbose_filehandle=self.log_fh
+        )
+
+        self.scaff_graph_ok = self._parse_bam(self.sequences, self.final_assembly_bam, self.min_scaff_depth, self.max_insert)
+        # This is to make this object picklable, to keep multithreading happy
+        self.log_fh = None
diff --git a/ariba/assembly_compare.py b/ariba/assembly_compare.py
new file mode 100644
index 00000000..645fe13b
--- /dev/null
+++ b/ariba/assembly_compare.py
@@ -0,0 +1,291 @@
+import os
+import copy
+import pyfastaq
+import pymummer
+
+class Error (Exception): pass  # exception type for this module
+
+class AssemblyCompare:
+    def __init__(self,
+      assembly_fa,
+      assembly_sequences,
+      ref_fa,
+      ref_sequence,
+      outprefix,
+      refdata,
+      nucmer_min_id=90,
+      nucmer_min_len=50,
+      nucmer_breaklen=50,
+      assembled_threshold=0.95,
+      unique_threshold=0.03,
+    ):
+        self.assembly_fa = os.path.abspath(assembly_fa)
+        self.assembly_sequences = assembly_sequences
+        self.ref_fa = os.path.abspath(ref_fa)
+        self.ref_sequence = ref_sequence
+        self.outprefix = os.path.abspath(outprefix)
+        self.refdata = refdata
+        # nucmer settings, and the two thresholds that update_flag passes to the coverage checks
+        self.nucmer_min_id = nucmer_min_id
+        self.nucmer_min_len = nucmer_min_len
+        self.nucmer_breaklen = nucmer_breaklen
+        self.assembled_threshold = assembled_threshold
+        self.unique_threshold = unique_threshold
+        # file names, all derived from outprefix
+        self.nucmer_coords_file = self.outprefix + '.nucmer.coords'
+        self.nucmer_snps_file = self.nucmer_coords_file + '.snps'
+        self.assembled_ref_seqs_file = self.outprefix + '.assembled_refs.fasta'
+
+
+    def _run_nucmer(self):
+        '''Runs nucmer on self.ref_fa and self.assembly_fa, writing self.nucmer_coords_file.
+           Uses the nucmer settings given to the constructor, with show_snps switched on'''
+        settings = {
+            'min_id': self.nucmer_min_id,
+            'min_length': self.nucmer_min_len,
+            'breaklen': self.nucmer_breaklen,
+            'show_snps': True,
+        }
+        pymummer.nucmer.Runner(self.ref_fa, self.assembly_fa, self.nucmer_coords_file, **settings).run()
+
+
+    @staticmethod
+    def _parse_nucmer_coords_file(coords_file, ref_name):
+        '''Input is coords file made by self._run_nucmer. Reference should have one sequence only.
+           ref_name is name of the reference sequence, to sanity check the coords file:
+           every hit must be to that sequence.
+           Returns dictionary. Key = assembly contig name. Value = list of nucmer hits to
+           that contig, in the same order as they are in the coords file'''
+        hits_by_contig = {}
+        for hit in pymummer.coords_file.reader(coords_file):
+            assert hit.ref_name == ref_name
+            # store a copy of each hit, not the object that came from the reader
+            hit_copy = copy.copy(hit)
+            hits_by_contig.setdefault(hit.qry_name, []).append(hit_copy)
+
+        return hits_by_contig
+
+
+    @staticmethod
+    def _nucmer_hits_to_percent_identity(nucmer_hits):
+        '''Input is hits made by self._parse_nucmer_coords_file.
+           Returns dictionary. key = contig name. Value = mean percent identity of the
+           hits to that contig, weighted by hit length in the contig, rounded to 2dp'''
+        identities = {}
+        for contig, hits in nucmer_hits.items():
+            weighted_total = 0
+            total_length = 0
+            for hit in hits:
+                total_length += hit.hit_length_qry
+                weighted_total += hit.hit_length_qry * hit.percent_identity
+            assert total_length > 0
+            identities[contig] = round(weighted_total / total_length, 2)
+
+        return identities
+
+
+    @staticmethod
+    def _nucmer_hits_to_assembly_coords(nucmer_hits):
+        '''Input is hits made by self._parse_nucmer_coords_file.
+           Returns dictionary. key = name of the contig that a hit is in.
+           Value = list of the coords in that contig that match to the reference gene,
+           after running merge_overlapping_in_list on the list'''
+        coords = {}
+        for hits in nucmer_hits.values():
+            for hit in hits:
+                coords.setdefault(hit.qry_name, []).append(hit.qry_coords())
+
+        # the return value of merge_overlapping_in_list is not used: each list is passed in as is
+        for coords_list in coords.values():
+            pyfastaq.intervals.merge_overlapping_in_list(coords_list)
+
+        return coords
+
+
+    def assembly_match_coords(self):  # needs self.nucmer_hits, which is set by run()
+        return self._nucmer_hits_to_assembly_coords(self.nucmer_hits)
+
+
+    @classmethod
+    def nucmer_hits_to_ref_coords(cls, nucmer_hits, contig=None):
+        '''Input is hits made by self._parse_nucmer_coords_file.
+           Returns dictionary. Key = contig name. Value = list of the coords in the
+           reference sequence of the hits to that contig (run through merge_overlapping_in_list).
+           By default, every contig in nucmer_hits gets an entry. If contig=contig_name,
+           then the dictionary only has the entry for that one contig'''
+        if contig is None:
+            wanted_contigs = list(nucmer_hits.keys())
+        else:
+            wanted_contigs = [contig]
+        coords = {}
+        for name in wanted_contigs:
+            ref_coords = [hit.ref_coords() for hit in nucmer_hits[name]]
+            pyfastaq.intervals.merge_overlapping_in_list(ref_coords)
+            coords[name] = ref_coords
+
+        return coords
+
+
+    @staticmethod
+    def ref_cov_per_contig(nucmer_hits):
+        '''Input is hits made by self._parse_nucmer_coords_file.
+           Returns dictionary. key = contig name. Value = number of bases of the
+           reference sequence that are covered by the hits to that contig.'''
+        ref_coords = AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits)
+        length_of = pyfastaq.intervals.length_sum_from_list
+        return {name: length_of(coords) for name, coords in ref_coords.items()}
+
+
+    @staticmethod
+    def _write_assembled_reference_sequences(nucmer_hits, ref_sequence, assembly, outfile):
+        '''nucmer_hits =  hits made by self._parse_nucmer_coords_file.
+           ref_sequence = reference sequence (pyfastaq.sequences.Fasta object)
+           assembly = dictionary of contig name -> contig.
+           Writes the piece of contig from each hit to a fasta file, reverse complemented
+           if the hit is not on the same strand. The sequence names look like this:
+           ref_name.ref_start.ref_end.contig_name.contig_start.contig_end.strand
+           (each position is the coord + 1), then .complete if hit_length_ref == ref_length.'''
+        out_fh = pyfastaq.utils.open_file_write(outfile)
+
+        for contig in sorted(nucmer_hits):
+            for hit in nucmer_hits[contig]:
+                ref_coords = hit.ref_coords()
+                qry_coords = hit.qry_coords()
+                piece = assembly[hit.qry_name].subseq(qry_coords.start, qry_coords.end + 1)
+                same_strand = hit.on_same_strand()
+                if not same_strand:
+                    piece.revcomp()
+
+                name_fields = [
+                    ref_sequence.id,
+                    str(ref_coords.start + 1),
+                    str(ref_coords.end + 1),
+                    contig,
+                    str(qry_coords.start + 1),
+                    str(qry_coords.end + 1),
+                    '+' if same_strand else '-',
+                ]
+                if hit.hit_length_ref == hit.ref_length:
+                    name_fields.append('complete')
+                piece.id = '.'.join(name_fields)
+                print(piece, file=out_fh)
+
+        pyfastaq.utils.close(out_fh)
+
+
+    @staticmethod
+    def _whole_gene_covered_by_nucmer_hits(nucmer_hits, ref_seq, threshold):
+        '''Returns true iff the reference sequence is covered by nucmer hits.
+           nucmer_hits = hits made by self._parse_nucmer_coords_file.
+           Counts as covered if (total ref bases covered) / len(ref_seq) >= threshold.
+           The reference coords from all the contigs are pooled before they are merged'''
+        per_contig = AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits)
+        covered = [interval for contig_coords in per_contig.values() for interval in contig_coords]
+        pyfastaq.intervals.merge_overlapping_in_list(covered)
+        bases_covered = pyfastaq.intervals.length_sum_from_list(covered)
+        return bases_covered / len(ref_seq) >= threshold
+
+
+    @staticmethod
+    def _ref_has_region_assembled_twice(nucmer_hits, ref_seq, threshold):
+        '''Returns true iff there is a part of the reference that is assembled
+           more than once (ie covered by >1 nucmer hit).
+           Needs a minimum proportion of the ref to be assembled more than once,
+           determined by threshold.
+           nucmer_hits = hits made by self._parse_nucmer_coords_file.'''
+        per_contig = AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits)
+        covered = [interval for contig_coords in per_contig.values() for interval in contig_coords]
+        covered.sort()
+
+        # need at least two intervals for anything to be covered twice
+        if len(covered) < 2:
+            return False
+
+        # depth[position] = number of intervals that contain that reference position
+        depth = {}
+        for interval in covered:
+            for position in range(interval.start, interval.end + 1):
+                depth[position] = 1 + depth.get(position, 0)
+
+        bases_depth_at_least_two = sum(1 for d in depth.values() if d > 1)
+        return bases_depth_at_least_two / len(ref_seq) >= threshold
+
+
+    @staticmethod
+    def _ref_covered_by_complete_contig_with_orf(nucmer_hits, contigs):
+        '''Returns true iff there is a contig that covers the entire reference,
+           and that contig has a complete open reading frame.
+           nucmer_hits = hits made by self._parse_nucmer_coords_file.
+           contigs = dictionary of contig name -> contig.'''
+        for hits in nucmer_hits.values():
+            for hit in hits:
+                # only a hit that spans the whole reference can have the complete gene
+                if hit.hit_length_ref != hit.ref_length:
+                    continue
+
+                start = min(hit.qry_start, hit.qry_end)
+                end = max(hit.qry_start, hit.qry_end)
+                assembled_gene = pyfastaq.sequences.Fasta('x', contigs[hit.qry_name][start:end+1])
+                # put the assembled gene in the same orientation as the reference
+                if (hit.ref_start < hit.ref_end) != (hit.qry_start < hit.qry_end):
+                    assembled_gene.revcomp()
+                orfs = assembled_gene.orfs()
+                if len(orfs) == 0:
+                    continue
+
+                # complete means the longest ORF is as long as the whole assembled gene
+                if len(max(orfs, key=len)) == len(assembled_gene):
+                    return True
+        return False
+
+
+    @staticmethod
+    def _ref_covered_by_at_least_one_full_length_contig(nucmer_hits):
+        '''Returns true iff there is at least one nucmer hit whose reference coords are
+           as long as the reference sequence, ie one contig covers the whole reference.
+           nucmer_hits = hits made by self._parse_nucmer_coords_file.'''
+        return any(
+            len(hit.ref_coords()) == hit.ref_length
+            for hits in nucmer_hits.values()
+            for hit in hits
+        )
+
+
+    def update_flag(self, flag):
+        '''Adds to flag (using flag.add) each property that holds, then returns flag. Uses self.nucmer_hits, set by run()'''
+        if self._whole_gene_covered_by_nucmer_hits(self.nucmer_hits, self.ref_sequence, self.assembled_threshold):
+            flag.add('assembled')
+
+        if self._ref_covered_by_at_least_one_full_length_contig(self.nucmer_hits):
+            flag.add('assembled_into_one_contig')
+
+        if self._ref_has_region_assembled_twice(self.nucmer_hits, self.ref_sequence, self.unique_threshold):
+            flag.add('region_assembled_twice')
+
+        ref_seq_type = self.refdata.sequence_type(self.ref_sequence.id)
+        if ref_seq_type != 'non_coding' and self._ref_covered_by_complete_contig_with_orf(self.nucmer_hits, self.assembly_sequences):
+            flag.add('complete_orf')
+
+        if len(self.nucmer_hits) == 1:
+            flag.add('unique_contig')
+        return flag
+
+
+    @staticmethod
+    def nucmer_hit_containing_reference_position(nucmer_hits, ref_name, ref_position):
+        '''Returns the first nucmer match found that contains the given
+           reference location. nucmer_hits = hits made by self._parse_nucmer_coords_file.
+           Returns None if no matching hit found'''
+        all_hits = (hit for hits in nucmer_hits.values() for hit in hits)
+        for hit in all_hits:
+            if hit.ref_name == ref_name and hit.ref_coords().distance_to_point(ref_position) == 0:
+                return hit
+
+        return None
+
+
+    def run(self):  # runs nucmer, sets self.nucmer_hits and self.percent_identities, writes assembled_ref_seqs_file
+        self._run_nucmer()
+        self.nucmer_hits = self._parse_nucmer_coords_file(self.nucmer_coords_file, self.ref_sequence.id)
+        self.percent_identities = self._nucmer_hits_to_percent_identity(self.nucmer_hits)
+        self._write_assembled_reference_sequences(self.nucmer_hits, self.ref_sequence, self.assembly_sequences, self.assembled_ref_seqs_file)
diff --git a/ariba/assembly_variants.py b/ariba/assembly_variants.py
new file mode 100644
index 00000000..3aee01d9
--- /dev/null
+++ b/ariba/assembly_variants.py
@@ -0,0 +1,327 @@
+import operator
+import pyfastaq
+import pymummer
+from ariba import sequence_variant
+from pyfastaq import intervals
+
+
+class Error (Exception): pass  # exception type for this module
+
+class AssemblyVariants:
+    def __init__(self, refdata, nucmer_snp_file):
+        '''refdata = object holding the reference data.
+           nucmer_snp_file = name of a snps file (presumably the one that nucmer
+           made when run by AssemblyCompare - TODO confirm against the caller)'''
+        self.refdata = refdata
+        self.nucmer_snp_file = nucmer_snp_file
+
+
+    @classmethod
+    def _get_codon_start(cls, gene_start, position):
+        '''Returns the position of the first base of the codon that contains position,
+           where codons are counted from gene_start. Needs position >= gene_start'''
+        assert position >= gene_start
+        return position - (position - gene_start) % 3
+
+
+    @classmethod
+    def _get_mummer_variants(cls, snp_file):
+        '''Loads the variants from a nucmer snps file.
+           Returns dictionary. Key = contig (query) name. Value = list of lists of
+           variants for that contig. The variants are in order of reference position,
+           and variants whose reference positions are in the same codon (counting codons
+           from position 0 of the reference) are together in the same inner list.
+           Returns an empty dictionary if there are no variants'''
+        variants = pymummer.snp_file.get_all_variants(snp_file)
+        # same order as sorting by qry_name and then (stable) sorting by ref_start
+        variants.sort(key=operator.attrgetter('ref_start', 'qry_name'))
+
+        # first split the sorted variants up by contig ...
+        variants_by_contig = {}
+        for variant in variants:
+            variants_by_contig.setdefault(variant.qry_name, []).append(variant)
+
+        # ... then put neighbouring variants that share a codon into the same list
+        mummer_variants = {}
+        for contig, contig_variants in variants_by_contig.items():
+            codon_groups = []
+            previous_codon_start = None
+            for variant in contig_variants:
+                codon_start = AssemblyVariants._get_codon_start(0, variant.ref_start)
+                if codon_start == previous_codon_start:
+                    codon_groups[-1].append(variant)
+                else:
+                    # first variant seen in this codon
+                    codon_groups.append([variant])
+                    previous_codon_start = codon_start
+            mummer_variants[contig] = codon_groups
+
+        return mummer_variants
+
+
+    @classmethod
+    def _get_variant_effect(cls, variants, ref_sequence):
+        '''variants = list of variants in the same codon.
+           returns type of variant (cannot handle more than one indel in the same codon),
+           as a tuple: (effect, amino acid change string or '.', zero-based amino acid position).
+           Returns None if variants is empty, or if the variants are not all of the same type'''
+        import sys  # needed for sys.stderr below; this module has no top-level import of sys
+        if len(variants) == 0:
+            return None
+
+        var_types = [x.var_type for x in variants]
+        if len(set(var_types)) != 1:
+            return None
+
+        var_type = var_types[0]
+        assert set([x.ref_name for x in variants]) == set([ref_sequence.id])
+        codon_starts = [AssemblyVariants._get_codon_start(0, x.ref_start) for x in variants]
+        assert len(set(codon_starts)) == 1
+        codon_start = codon_starts[0]
+        aa_start = codon_start // 3
+        ref_codon = pyfastaq.sequences.Fasta('codon', ref_sequence[codon_start:codon_start+3])
+        ref_aa = ref_codon.translate()
+
+        if var_type == pymummer.variant.SNP:
+            new_codon = list(ref_codon.seq)
+            for v in variants:
+                new_codon[v.ref_start - codon_start] = v.qry_base
+            new_codon = pyfastaq.sequences.Fasta('new', ''.join(new_codon))
+            qry_aa = new_codon.translate()
+
+            if ref_aa.seq == qry_aa.seq:
+                return ('SYN', '.', aa_start)
+            elif qry_aa.seq == '*':
+                return ('TRUNC', ref_aa.seq + str(aa_start + 1) + 'trunc', aa_start)
+            else:
+                return ('NONSYN', ref_aa.seq + str(aa_start + 1) + qry_aa.seq, aa_start)
+        elif var_type in [pymummer.variant.INS, pymummer.variant.DEL]:
+            if len(variants) > 1:
+                print('More than one indel in same codon not yet implemented!', ref_sequence.id, file=sys.stderr)
+                return 'INDELS', '.', aa_start
+
+            var = variants[0]
+
+            if var_type == pymummer.variant.INS:
+                new_seq = pyfastaq.sequences.Fasta('seq', var.qry_base)
+            else:
+                new_seq = pyfastaq.sequences.Fasta('seq', var.ref_base)
+
+            if len(new_seq) % 3 != 0:
+                return ('FSHIFT', ref_aa.seq + str(aa_start + 1) + 'fs', aa_start)
+
+            new_seq_aa = new_seq.translate()
+            if '*' in new_seq_aa.seq:
+                return ('TRUNC', ref_aa.seq + str(aa_start + 1) + 'trunc', aa_start)
+            elif var_type == pymummer.variant.INS:
+                ref_codon_after_ins = pyfastaq.sequences.Fasta('codon', ref_sequence[codon_start+3:codon_start+6])
+                aa_after_ins = ref_codon_after_ins.translate()
+                return ('INS', ref_aa.seq + str(aa_start + 1) + '_' + aa_after_ins.seq + str(aa_start + 2) + 'ins' + new_seq_aa.seq , aa_start)
+            else:
+                if len(new_seq) == 3:
+                    return ('DEL', ref_aa.seq + str(aa_start + 1) + 'del', aa_start)
+                else:
+                    ref_codon_after_ins = pyfastaq.sequences.Fasta('codon', ref_sequence[codon_start+3:codon_start+6])
+                    aa_after_ins = ref_codon_after_ins.translate()
+                    return ('DEL', ref_aa.seq + str(aa_start + 1)+ '_' + aa_after_ins.seq + str(aa_start + 2) + 'del', aa_start)
+
+        else:
+            return ('UNKNOWN', '.', aa_start)
+
+
+    @staticmethod
+    def _filter_mummer_variants(mummer_variants, ref_sequence):
+        '''Changes mummer_variants: for each contig, keeps the lists of variants up to and
+           including the first one whose effect is TRUNC or FSHIFT, and drops the rest'''
+        for contig, variants in mummer_variants.items():
+            keep = len(variants)  # keep everything (also right when variants is empty) unless told otherwise
+            for i, codon_variants in enumerate(variants):
+                t = AssemblyVariants._get_variant_effect(codon_variants, ref_sequence)
+                if t is not None and t[0] in ['TRUNC', 'FSHIFT']:
+                    keep = i + 1
+                    break
+            mummer_variants[contig] = variants[:keep]
+
+
+    @staticmethod
+    def _get_one_variant_for_one_contig_non_coding(refdata_var_dict, mummer_variant):
+        '''Compares one nucmer variant with the known variants of the reference, which are
+           looked up by reference position in refdata_var_dict['n'] (refdata_var_dict
+           can be None).
+           Returns a tuple: (variant tuple, set of the known variants at that position).
+           The variant tuple is None when the variant is a SNP whose new base is the
+           wild value of a known variant at that position. Otherwise it is:
+           (ref position, 'n', variant string, variant type, [mummer_variant],
+           matching known variants, other known variants at the same position)'''
+        position = mummer_variant.ref_start
+        matching_variants = set()
+        other_known_variants = set()
+        wild_type_variants = set()
+
+        # is the variant at the same position as a known variant in the reference?
+        if refdata_var_dict is not None and position in refdata_var_dict['n']:
+            if mummer_variant.var_type == pymummer.variant.SNP:
+                known_here = set(refdata_var_dict['n'][position])
+                matching_variants = {x for x in known_here if mummer_variant.qry_base == x.variant.variant_value}
+                wild_type_variants = {x for x in known_here if mummer_variant.qry_base == x.variant.wild_value}
+                other_known_variants = known_here.difference(matching_variants)
+            else:
+                # not a SNP, so nothing can match: every known variant here counts as "other"
+                other_known_variants = refdata_var_dict['n'][position]
+
+        used_known_variants = set()
+        used_known_variants.update(matching_variants, other_known_variants)
+        if len(wild_type_variants) > 0:
+            return None, used_known_variants
+
+        var_tuple = (
+            position,
+            'n',
+            mummer_variant.ref_base + str(position + 1) + mummer_variant.qry_base,
+            pymummer.variant.var_types[mummer_variant.var_type],
+            [mummer_variant],
+            matching_variants,
+            other_known_variants
+        )
+        return var_tuple, used_known_variants
+
+
+    @staticmethod
+    def _get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variants_list):
+        '''Makes the variant tuple (see get_variants) for the mummer variants of one
+           codon of a coding sequence. Returns (var_tuple, used_known_variants).
+           var_tuple is None when _get_variant_effect returns None, or when the new
+           amino acid is the wild type value of a known variant at the same position.'''
+        effect = AssemblyVariants._get_variant_effect(mummer_variants_list, ref_sequence)
+        if effect is None:
+            # no effect was determined: skip this variant instead of dying when unpacking None
+            return None, set()
+
+        aa_var_effect, aa_var_string, aa_var_position = effect
+        matching_variants = set()
+        variants_at_this_position = set()
+        not_interesting_variants = set()
+        used_known_variants = set()
+
+        # if this variant is at the same position as a known variant in the reference
+        if refdata_var_dict is not None and aa_var_position in refdata_var_dict['p']:
+            if aa_var_effect == 'NONSYN':
+                aa_variant = sequence_variant.Variant('p', aa_var_string)
+                variants_at_this_position = {x for x in refdata_var_dict['p'][aa_variant.position]}
+                matching_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.variant_value}
+                not_interesting_variants = {x for x in variants_at_this_position if aa_variant.variant_value == x.variant.wild_value}
+                variants_at_this_position = variants_at_this_position.difference(matching_variants)
+            else:
+                variants_at_this_position = refdata_var_dict['p'][aa_var_position]
+
+            used_known_variants.update(matching_variants, variants_at_this_position)
+
+        if len(not_interesting_variants) > 0:
+            return None, used_known_variants
+
+        var_tuple = (
+            aa_var_position,
+            'p',
+            aa_var_string,
+            aa_var_effect,
+            mummer_variants_list,
+            matching_variants,
+            variants_at_this_position
+        )
+        return var_tuple, used_known_variants
+
+
+    @staticmethod
+    def _get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nucmer_coords):
+        '''Finds the known variants where the ref has the variant and so does the
+           contig, which means there was no mummer call to flag them up. Looks through
+           known_ref_variants (dict of position -> set of known variants), skipping
+           anything in used_ref_variants, and keeps each variant whose position
+           intersects one of the intervals in nucmer_coords, ie is in a nucmer match
+           to an assembly contig. Returns a list of variant tuples.'''
+        remaining = []
+
+        for _, known_variants in sorted(known_ref_variants.items()):
+            for known in known_variants:
+                if known in used_ref_variants:
+                    continue
+
+                position = known.variant.position
+
+                # amino acid position is turned into the span of its three nucleotides
+                if known.variant_type == 'n':
+                    first, last = position, position
+                elif known.variant_type == 'p':
+                    first, last = 3 * position, 3 * position + 2
+                else:
+                    raise Error('Unexpected variant type "' + known.variant_type + '" in _get_remaining_known_ref_variants. Cannot continue')
+
+                ref_interval = intervals.Interval(first, last)
+                if any(ref_interval.intersects(hit) for hit in nucmer_coords):
+                    remaining.append((None, known.variant_type, None, None, None, {known}, set()))
+
+        return remaining
+
+
+    def get_variants(self, ref_sequence_name, nucmer_coords):
+        '''Nucmer coords = dict. Key=contig name. Value = list of intervals of ref coords that match the contig.
+           Made by assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords
+           Returns dictionary. Key=contig name. Value = list of variants (contigs with no variants are left out).
+           Each variant is a tuple: (
+               0 = position,
+               1 = type in ['n', 'p']
+               2 = Variant string, eg 'D2E',
+               3 = variant effect (as returned by _get_variant_effect)
+               4 = list of pymummer.variant.Variant that made up this variant (could be more than one because of
+                   variants in the same codon)
+               5 = set {matching known variants from metadata (=sequence_metadata.SequenceMetadata)}
+               6 = set {known ref metadata (=sequence_metadata.SequenceMetadata)  at same position as SNP}, excluding those from 5
+           )
+        '''
+        mummer_variants = self._get_mummer_variants(self.nucmer_snp_file)
+        variants = {}
+        ref_sequence_type = self.refdata.sequence_type(ref_sequence_name)
+        assert ref_sequence_type is not None
+        ref_sequence = self.refdata.sequence(ref_sequence_name)
+
+        if ref_sequence_name in self.refdata.metadata:
+            refdata_var_dict = self.refdata.metadata[ref_sequence_name]
+        else:
+            refdata_var_dict = None
+
+        known_non_wild_variants_in_ref = self.refdata.all_non_wild_type_variants(ref_sequence_name)
+
+        for contig in nucmer_coords:
+            used_known_variants = set()
+            variants[contig] = []
+
+            if contig in mummer_variants:
+                for mummer_variant_list in mummer_variants[contig]:
+                    if ref_sequence_type == 'non_coding':
+                        # each mummer variant is reported separately, so keep the result of every call
+                        results = [self._get_one_variant_for_one_contig_non_coding(refdata_var_dict, x) for x in mummer_variant_list]
+                    else:
+                        results = [self._get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, mummer_variant_list)]
+
+                    # include new variant, except if the ref type is variants only and
+                    # the new variant does not match a known variant
+                    for new_variant, used_variants in results:
+                        if new_variant is not None and (ref_sequence_type != 'variants_only' or len(new_variant[5]) > 0):
+                            variants[contig].append(new_variant)
+                        used_known_variants.update(used_variants)
+
+            # for this contig, need to know all the ref sequence and coords it maps to.
+            # Then report just the unused known variants, as the contig also has these variants
+            if ref_sequence_type == 'non_coding':
+                new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['n'], used_known_variants, nucmer_coords[contig])
+            else:
+                new_variants = self._get_remaining_known_ref_variants(known_non_wild_variants_in_ref['p'], used_known_variants, nucmer_coords[contig])
+                if ref_sequence_type == 'variants_only':
+                    new_variants = [x for x in new_variants if len(x[5]) > 0]
+
+            variants[contig].extend(new_variants)
+            if len(variants[contig]) == 0:
+                del variants[contig]
+
+        return variants
+
diff --git a/ariba/best_seq_chooser.py b/ariba/best_seq_chooser.py
new file mode 100644
index 00000000..09880a24
--- /dev/null
+++ b/ariba/best_seq_chooser.py
@@ -0,0 +1,99 @@
+import shutil
+import tempfile
+import sys
+import os
+import pyfastaq
+from ariba import mapping, faidx
+
+class Error (Exception): pass  # exception type defined by this module
+
+class BestSeqChooser:
+    '''Chooses, from a FASTA file of reference sequences, the one that the reads map to with the highest total alignment score'''
+    def __init__(self,
+        reads1,
+        reads2,
+        references_fa,
+        log_fh,
+        samtools_exe='samtools',
+        bowtie2_exe='bowtie2',
+        bowtie2_preset='very-sensitive-local',
+        threads=1,
+    ):
+        self.reads1 = reads1
+        self.reads2 = reads2
+        self.references_fa = references_fa
+        self.log_fh = log_fh
+        self.samtools_exe = samtools_exe
+        self.bowtie2_exe = bowtie2_exe
+        self.bowtie2_preset = bowtie2_preset
+        self.threads = threads
+
+
+    def _total_alignment_score(self, seq_name):
+        '''Maps the reads with bowtie2 to the sequence called seq_name and returns the total alignment score.
+           Works in a temporary directory made in the cwd, which is removed even if one of the steps fails'''
+        tmpdir = tempfile.mkdtemp(prefix='tmp.get_total_aln_score.', dir=os.getcwd())
+        tmp_bam = os.path.join(tmpdir, 'tmp.get_total_alignment_score.bam')
+        tmp_fa = os.path.join(tmpdir, 'tmp.get_total_alignment_score.ref.fa')
+
+        try:
+            faidx.write_fa_subset([seq_name], self.references_fa, tmp_fa, samtools_exe=self.samtools_exe, verbose=True, verbose_filehandle=self.log_fh)
+
+            mapping.run_bowtie2(
+                self.reads1,
+                self.reads2,
+                tmp_fa,
+                tmp_bam[:-4],
+                threads=self.threads,
+                samtools=self.samtools_exe,
+                bowtie2=self.bowtie2_exe,
+                bowtie2_preset=self.bowtie2_preset,
+                verbose=True,
+                verbose_filehandle=self.log_fh
+            )
+            return mapping.get_total_alignment_score(tmp_bam)
+        finally:
+            shutil.rmtree(tmpdir)
+
+
+    def _get_best_seq_by_alignment_score(self):
+        '''Returns the name of the sequence in references_fa that has the highest total alignment score.
+           If there is only one sequence then its name is returned without mapping any reads'''
+        total_sequences = pyfastaq.tasks.count_sequences(self.references_fa)
+        if total_sequences == 1:
+            seqs = {}
+            pyfastaq.tasks.file_to_dict(self.references_fa, seqs)
+            assert len(seqs) == 1
+            seq_name = list(seqs.values())[0].id
+            print('No need to choose sequence for this cluster because only has one sequence:', seq_name, file=self.log_fh)
+            return seq_name
+
+        print('\nChoosing best sequence from cluster of', total_sequences, 'sequences...', file=self.log_fh)
+        file_reader = pyfastaq.sequences.file_reader(self.references_fa)
+        # start from zero so that a sequence is only chosen if its score is positive (otherwise None is returned)
+        best_score = 0
+        best_seq_name = None
+        for seq in file_reader:
+            score = self._total_alignment_score(seq.id)
+            print('Total alignment score for sequence', seq.id, 'is', score, file=self.log_fh)
+            if score > best_score:
+                best_score = score
+                best_seq_name = seq.id
+
+        print('\nBest sequence is', best_seq_name, 'with total alignment score of', best_score, file=self.log_fh)
+        print(file=self.log_fh)
+        return best_seq_name
+
+
+    def best_seq(self, outfile):
+        '''Finds the closest matching sequence, writes it to the FASTA file outfile, and returns it as a pyfastaq.sequences.Fasta object.
+           Returns None, without writing outfile, if no sequence had a total alignment score greater than zero'''
+        seq_name = self._get_best_seq_by_alignment_score()
+        if seq_name is None:
+            return None
+        faidx.write_fa_subset([seq_name], self.references_fa, outfile, samtools_exe=self.samtools_exe, verbose=True, verbose_filehandle=self.log_fh)
+        seqs = {}
+        pyfastaq.tasks.file_to_dict(outfile, seqs)
+        assert len(seqs) == 1
+        return list(seqs.values())[0]
+
diff --git a/ariba/cdhit.py b/ariba/cdhit.py
index 32764041..9b021339 100644
--- a/ariba/cdhit.py
+++ b/ariba/cdhit.py
@@ -6,8 +6,6 @@
 
 class Error (Exception): pass
 
-
-
 class Runner:
     def __init__(
       self,
@@ -17,6 +15,7 @@ def __init__(
       threads=1,
       length_diff_cutoff=0.9,
       verbose=False,
+      cd_hit_est='cd-hit-est',
     ):
 
         if not os.path.exists(infile):
@@ -28,113 +27,88 @@ def __init__(
         self.threads = threads
         self.length_diff_cutoff = length_diff_cutoff
         self.verbose = verbose
+        self.cd_hit_est = cd_hit_est
 
 
     def fake_run(self):
         '''Doesn't actually run cd-hit. Instead, puts each input sequence into its own cluster. So it's as if cdhit was run, but didn't cluster anything'''
-        cluster_to_name = {}
-        found_names = set()
+        clusters = {}  # sequence name -> set containing just that name
         seq_reader = pyfastaq.sequences.file_reader(self.infile)
         f = pyfastaq.utils.open_file_write(self.outfile)
+
         for seq in seq_reader:
-            if seq.id in found_names:
+            if seq.id in clusters:
+                pyfastaq.utils.close(f)  # close the output file before raising
                 raise Error('Sequence name "' + seq.id + '" not unique. Cannot continue')
-            found_names.add(seq.id)
-            cluster_number = str(len(cluster_to_name))
-            cluster_to_name[cluster_number] = {seq.id}
-            seq.id = cluster_number
+
+            clusters[seq.id] = {seq.id}
             print(seq, file=f)
 
         pyfastaq.utils.close(f)
-        return cluster_to_name
-
-
-    def run(self):
-        tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
-        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
-        cluster_info_outfile = cdhit_fasta + '.bak.clstr'
-        infile_renamed = os.path.join(tmpdir, 'input.renamed.fa')
-
-        # cd-hit truncates all names to 19 bases in its report of which
-        # sequences belong to which clusters. So need to temporarily
-        # rename all sequences to have short enough names. Grrr.
-        new_to_old_name = self._enumerate_fasta(self.infile, infile_renamed)
-
-        cmd = ' '.join([
-            'cd-hit-est',
-            '-i', infile_renamed,
-            '-o', cdhit_fasta,
-            '-c', str(self.seq_identity_threshold),
-            '-T', str(self.threads),
-            '-s', str(self.length_diff_cutoff),
-            '-bak 1',
-        ])
-
-        common.syscall(cmd, verbose=self.verbose)
-
-        cluster_representatives = self._get_ids(cdhit_fasta)
-        clusters, cluster_rep_to_cluster = self._parse_cluster_info_file(cluster_info_outfile, new_to_old_name, cluster_representatives)
-        self._rename_fasta(cdhit_fasta, self.outfile, cluster_rep_to_cluster)
-        shutil.rmtree(tmpdir)
         return clusters
 
 
-    def _enumerate_fasta(self, infile, outfile):
-        rename_file = outfile + '.tmp.rename_info'
-        assert not os.path.exists(rename_file)
-        pyfastaq.tasks.enumerate_names(infile, outfile, rename_file=rename_file)
-
-        with open(rename_file) as f:
-            lines = [x.rstrip().split('\t') for x in f.readlines() if x != '#old\tnew\n']
-            new_to_old_name = {x[1]: x[0] for x in lines}
-            if len(lines) != len(new_to_old_name):
-                raise Error('Sequence names in input file not unique! Cannot continue')
-
-        os.unlink(rename_file)
-        return new_to_old_name
-
-
-    def _rename_fasta(self, infile, outfile, names_dict):
+    def _get_ids(self, infile):  # returns the set of names of all the sequences in infile
         seq_reader = pyfastaq.sequences.file_reader(infile)
-        f = pyfastaq.utils.open_file_write(outfile)
-        for seq in seq_reader:
-            seq.id = names_dict[seq.id]
-            print(seq, file=f)
-
-        pyfastaq.utils.close(f)
+        return {seq.id for seq in seq_reader}
 
 
-    def _parse_cluster_info_file(self, infile, names_dict, cluster_representatives):
+    @staticmethod
+    def _parse_cluster_info_file(infile, cluster_representatives):  # returns dict: representative name -> set of names in its cluster
         f = pyfastaq.utils.open_file_read(infile)
         clusters = {}
-        cluster_representative_to_cluster_number = {}
+        current_cluster = None  # name of the representative sequence of the cluster being read
+
         for line in f:
             data = line.rstrip().split()
-            cluster = data[0]
             seqname = data[2]
             if not (seqname.startswith('>') and seqname.endswith('...')):
-                raise Error('Unexpected format of sequence name in line:\n' + line)
+                raise Error('Unexpected format of line from cdhit output file "' + infile + '". Line is:\n' + line)
             seqname = seqname[1:-3]
 
-            if seqname in cluster_representatives:
-                cluster_representative_to_cluster_number[seqname] = cluster
+            if data[3] == '*':  # '*' marks the representative sequence, which starts a new cluster
+                current_cluster = seqname
+                assert current_cluster not in clusters
+                clusters[current_cluster] = {current_cluster}
+            else:
+                assert current_cluster in clusters
+                if seqname in clusters[current_cluster]:
+                    raise Error('Duplicate name "' + seqname + '" found in cluster ' + current_cluster)
 
-            seqname = names_dict[seqname]
+                clusters[current_cluster].add(seqname)
 
-            if cluster not in clusters:
-                clusters[cluster] = set()
+        pyfastaq.utils.close(f)
+        if set(clusters.keys()) != cluster_representatives:
+            raise Error('Mismatch in cdhit output sequence names between fasta file and clusters file. Cannot continue')
 
-            if seqname in clusters[cluster]:
-                raise Error('Duplicate name "' + seqname + '" found in cluster ' + str(cluster))
+        return clusters
 
-            clusters[cluster].add(seqname)
 
-        pyfastaq.utils.close(f)
+    def run(self):  # runs cd-hit-est on self.infile, moves its fasta output to self.outfile, returns the clusters dict
+        tmpdir = tempfile.mkdtemp(prefix='tmp.run_cd-hit.', dir=os.getcwd())
+        cdhit_fasta = os.path.join(tmpdir, 'cdhit')
+        cluster_info_outfile = cdhit_fasta + '.bak.clstr'
 
-        return clusters, cluster_representative_to_cluster_number
+        cmd = ' '.join([
+            self.cd_hit_est,
+            '-i', self.infile,
+            '-o', cdhit_fasta,
+            '-c', str(self.seq_identity_threshold),
+            '-T', str(self.threads),
+            '-s', str(self.length_diff_cutoff),
+            '-d 0',  # keep full sequence names in the clusters file (by default cd-hit truncates them)
+            '-bak 1',
+        ])
+
+        common.syscall(cmd, verbose=self.verbose)
+        cluster_representatives = self._get_ids(cdhit_fasta)
+        clusters = self._parse_cluster_info_file(cluster_info_outfile, cluster_representatives)
 
+        try:
+            shutil.move(cdhit_fasta, self.outfile)  # unlike os.rename, also works across filesystems
+        except OSError as error:
+            raise Error('Error rname ' + cdhit_fasta + ' ' + self.outfile + '. Cannot continue') from error
 
-    def _get_ids(self, infile):
-        seq_reader = pyfastaq.sequences.file_reader(infile)
-        return set([seq.id for seq in seq_reader])
+        shutil.rmtree(tmpdir)
+        return clusters
 
diff --git a/ariba/cluster.py b/ariba/cluster.py
index 2294ceb7..c547cae1 100644
--- a/ariba/cluster.py
+++ b/ariba/cluster.py
@@ -1,13 +1,8 @@
 import os
-import copy
-from operator import itemgetter
-import sys
 import shutil
-import pysam
-import operator
+import sys
 import pyfastaq
-import pymummer
-from ariba import common, mapping, bam_parse, flag, faidx
+from ariba import assembly, assembly_compare, assembly_variants, bam_parse, best_seq_chooser, external_progs, flag, mapping, report, samtools_variants
 
 class Error (Exception): pass
 
@@ -17,15 +12,16 @@ class Cluster:
     def __init__(self,
       root_dir,
       name,
-      assembly_kmer=0,
-      assembler='velvet',
+      refdata,
+      assembly_kmer=21,
+      assembler='spades',
       max_insert=1000,
       min_scaff_depth=10,
       nucmer_min_id=90,
       nucmer_min_len=50,
       nucmer_breaklen=50,
-      sspace_k=20,
       reads_insert=500,
+      sspace_k=20,
       sspace_sd=0.4,
       threads=1,
       bcf_min_dp=10,
@@ -34,17 +30,10 @@ def __init__(self,
       bcf_min_qual=20,
       assembled_threshold=0.95,
       unique_threshold=0.03,
-      bcftools_exe='bcftools',
-      gapfiller_exe='GapFiller.pl',
-      samtools_exe='samtools',
-      bowtie2_exe='bowtie2',
       bowtie2_preset='very-sensitive-local',
-      smalt_exe='smalt',
-      spades_exe='spades.py',
-      sspace_exe='SSPACE_Basic_v2.0.pl',
-      velvet_exe='velvet', # prefix of velvet{g,h}
-      spades_other=None,
+      spades_other_options=None,
       clean=1,
+      extern_progs=None,
     ):
 
         self.root_dir = os.path.abspath(root_dir)
@@ -52,88 +41,58 @@ def __init__(self,
             raise Error('Directory ' + self.root_dir + ' not found. Cannot continue')
 
         self.name = name
+        self.refdata = refdata
+        self.assembly_kmer = assembly_kmer
+        self.assembler = assembler
+        self.sspace_k = sspace_k
+        self.sspace_sd = sspace_sd
+        self.reads_insert = reads_insert
+        self.spades_other_options = spades_other_options
+
         self.reads1 = os.path.join(self.root_dir, 'reads_1.fq')
         self.reads2 = os.path.join(self.root_dir, 'reads_2.fq')
-        self.gene_fa = os.path.join(self.root_dir, 'gene.fa')
-        self.genes_fa = os.path.join(self.root_dir, 'genes.fa')
-        self.gene_bam = os.path.join(self.root_dir, 'gene.reads_mapped.bam')
+        self.reference_fa = os.path.join(self.root_dir, 'reference.fa')
+        self.references_fa = os.path.join(self.root_dir, 'references.fa')
 
-        for fname in [self.reads1, self.reads2, self.genes_fa]:
+        for fname in [self.reads1, self.reads2, self.references_fa]:
             if not os.path.exists(fname):
                 raise Error('File ' + fname + ' not found. Cannot continue')
 
 
+        self.ref_sequence = None
+
         self.max_insert = max_insert
         self.min_scaff_depth = min_scaff_depth
 
         self.nucmer_min_id = nucmer_min_id
         self.nucmer_min_len = nucmer_min_len
         self.nucmer_breaklen = nucmer_breaklen
-        self.assembly_vs_gene_coords = os.path.join(self.root_dir, 'assembly_vs_gene.coords')
 
         self.bcf_min_dp = bcf_min_dp
         self.bcf_min_dv = bcf_min_dv
         self.bcf_min_dv_over_dp = bcf_min_dv_over_dp
         self.bcf_min_qual = bcf_min_qual
 
-        self._set_assembly_kmer(assembly_kmer)
-        self.assembler = assembler
-        assert self.assembler in ['velvet', 'spades']
-        self.spades_exe = spades_exe
-        self.spades_other = spades_other
-
-        self.bcftools_exe = bcftools_exe
-
-        self.sspace_exe = shutil.which(sspace_exe)
-        if self.sspace_exe is None:
-            self.gapfiller_exe = None
-        else:
-            self.sspace_exe = os.path.realpath(self.sspace_exe) # otherwise sspace dies loading packages
-            self.gapfiller_exe = shutil.which(gapfiller_exe)
-            if self.gapfiller_exe is not None:
-                self.gapfiller_exe = os.path.realpath(self.gapfiller_exe) # otherwise gapfiller dies loading packages
-
-        self.samtools_exe = samtools_exe
-        self.smalt_exe = smalt_exe
-        self.bowtie2_exe = bowtie2_exe
         self.bowtie2_preset = bowtie2_preset
 
-        if self.assembler == 'velvet':
-            self.velveth = velvet_exe + 'h'
-            self.velvetg = velvet_exe + 'g'
-
-        self.sspace_k = sspace_k
-        self.reads_insert = reads_insert
-        self.sspace_sd = sspace_sd
-
         self.threads = threads
         self.assembled_threshold = assembled_threshold
         self.unique_threshold = unique_threshold
         self.status_flag = flag.Flag()
-        self.flag_file = os.path.join(self.root_dir, 'flag')
         self.clean = clean
 
         self.assembly_dir = os.path.join(self.root_dir, 'Assembly')
-        try:
-            os.mkdir(self.assembly_dir)
-        except:
-            raise Error('Error mkdir ' + self.assembly_dir)
-        self.assembler_dir = os.path.join(self.assembly_dir, 'Assemble')
-        self.assembly_contigs = os.path.join(self.assembly_dir, 'contigs.fa')
-        self.scaffold_dir = os.path.join(self.assembly_dir, 'Scaffold')
-        self.scaffolder_scaffolds = os.path.join(self.assembly_dir, 'scaffolds.fa')
-        self.gapfill_dir = os.path.join(self.assembly_dir, 'Gapfill')
-        self.gapfilled_scaffolds = os.path.join(self.assembly_dir, 'scaffolds.gapfilled.fa')
         self.final_assembly_fa = os.path.join(self.root_dir, 'assembly.fa')
         self.final_assembly_bam = os.path.join(self.root_dir, 'assembly.reads_mapped.bam')
         self.final_assembly_read_depths = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.read_depths.gz')
         self.final_assembly_vcf = os.path.join(self.root_dir, 'assembly.reads_mapped.bam.vcf')
-        self.final_assembled_genes_fa = os.path.join(self.root_dir, 'assembly.genes.fa')
-        self.final_assembly = {}
+        self.samtools_vars_prefix = self.final_assembly_bam
+        self.assembly_compare_prefix = os.path.join(self.root_dir, 'assembly_compare')
+
         self.mummer_variants = {}
         self.variant_depths = {}
         self.percent_identities = {}
-        self.total_reads = None
+        self.total_reads = self._count_reads(self.reads1, self.reads2)
 
         # The log filehandle self.log_fh is set at the start of the run() method.
         # Lots of other methods use self.log_fh. But for unit testing, run() isn't
@@ -145,983 +104,187 @@ def __init__(self,
         if unittest:
             self.log_fh = sys.stdout
 
-
-    def _get_read_counts(self):
-        if self.total_reads is not None:
-            return self.total_reads
-
-        count1 = pyfastaq.tasks.count_sequences(self.reads1)
-        count2 = pyfastaq.tasks.count_sequences(self.reads2)
-        if count1 == count2:
-            self.total_reads = count1 + count2
-            return self.total_reads
+        if extern_progs is None:
+            self.extern_progs = external_progs.ExternalProgs()
         else:
-            raise Error('Different number of fwd/rev reads in cluster ' + self.name + '! Cannot continue')
-
-
-    def _get_total_alignment_score(self, gene_name):
-        tmp_bam = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.bam')
-        assert not os.path.exists(tmp_bam)
-        tmp_fa = os.path.join(self.root_dir, 'tmp.get_total_alignment_score.ref.fa')
-        assert not os.path.exists(tmp_fa)
-        faidx.write_fa_subset([gene_name], self.genes_fa, tmp_fa, samtools_exe=self.samtools_exe, verbose=True, verbose_filehandle=self.log_fh)
-        mapping.run_bowtie2(
-            self.reads1,
-            self.reads2,
-            tmp_fa,
-            tmp_bam[:-4],
-            threads=self.threads,
-            samtools=self.samtools_exe,
-            bowtie2=self.bowtie2_exe,
-            bowtie2_preset=self.bowtie2_preset,
-            verbose=True,
-            verbose_filehandle=self.log_fh
-        )
-
-        score = mapping.get_total_alignment_score(tmp_bam)
-        os.unlink(tmp_bam)
-        os.unlink(tmp_fa)
-        os.unlink(tmp_fa + '.fai')
-        return score
-
-
-    def _get_best_gene_by_alignment_score(self):
-        cluster_size = pyfastaq.tasks.count_sequences(self.genes_fa)
-        if cluster_size == 1:
-            seqs = {}
-            pyfastaq.tasks.file_to_dict(self.genes_fa, seqs)
-            assert len(seqs) == 1
-            gene_name = list(seqs.values())[0].id
-            print('No need to choose gene for this cluster because only has one gene:', gene_name, file=self.log_fh)
-            return gene_name
-
-        print('\nChoosing best gene from cluster of', cluster_size, 'genes...', file=self.log_fh)
-        file_reader = pyfastaq.sequences.file_reader(self.genes_fa)
-        best_score = 0
-        best_gene_name = None
-        for seq in file_reader:
-            score = self._get_total_alignment_score(seq.id)
-            print('Total alignment score for gene', seq.id, 'is', score, file=self.log_fh)
-            if score > best_score:
-                best_score = score
-                best_gene_name = seq.id
-
-        print('\nBest gene is', best_gene_name, 'with total alignment score of', best_score, file=self.log_fh)
-        print(file=self.log_fh)
-
-        return best_gene_name
-
-
-    def _choose_best_gene(self):
-        gene_name = self._get_best_gene_by_alignment_score()
-        if gene_name is None:
-            return None
-        faidx.write_fa_subset([gene_name], self.genes_fa, self.gene_fa, samtools_exe=self.samtools_exe, verbose=True, verbose_filehandle=self.log_fh)
-        seqs = {}
-        pyfastaq.tasks.file_to_dict(self.gene_fa, seqs)
-        assert len(seqs) == 1
-        return list(seqs.values())[0]
-
-
-    def _set_assembly_kmer(self, k):
-        '''If the kmer not given, uses 2/3 of the mean read length (using first 1000 forward and first 1000 reverse reads)'''
-        if k == 0:
-            read_length1 = pyfastaq.tasks.mean_length(self.reads1, limit=1000)
-            read_length2 = pyfastaq.tasks.mean_length(self.reads2, limit=1000)
-            self.assembly_kmer = round( (read_length1 + read_length2) / 3)
-            if self.assembly_kmer % 2 == 0:
-                self.assembly_kmer += 1
-        else:
-            self.assembly_kmer = k
-
-
-    def _assemble_with_velvet(self):
-        # map reads to reference gene to make BAM input to velvet columbus
-        mapping.run_bowtie2(
-            self.reads1,
-            self.reads2,
-            self.gene_fa,
-            self.gene_bam[:-4],
-            threads=self.threads,
-            sort=True,
-            samtools=self.samtools_exe,
-            bowtie2=self.bowtie2_exe,
-            bowtie2_preset=self.bowtie2_preset,
-            verbose=True,
-            verbose_filehandle=self.log_fh
-        )
-
-        cmd = ' '.join([
-            self.velveth,
-            self.assembler_dir,
-            str(self.assembly_kmer),
-            '-reference', self.gene_fa,
-            '-shortPaired -bam', self.gene_bam[:-4] + '.unsorted.bam'
-        ])
-
-        cwd = os.getcwd()
-        os.chdir(self.assembly_dir)
-        velvet_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'contigs.fa')
-
-        self.velveth_ok, err = common.syscall(cmd, verbose=True, allow_fail=True, verbose_filehandle=self.log_fh)
-        if not self.velveth_ok:
-            with open('velveth_errors', 'w') as f:
-                print(err, file=f)
-                f.close()
-            self.status_flag.add('assembly_fail')
-            os.chdir(cwd)
-            return
-
-        cmd = ' '.join([
-            self.velvetg,
-            self.assembler_dir,
-            '-ins_length', str(int(self.reads_insert)),
-            '-scaffolding no',
-            '-exp_cov auto',
-            '-very_clean yes',
-            '-cov_cutoff auto',
-        ])
-
-        self.assembled_ok, err = common.syscall(cmd, verbose=True, allow_fail=True, verbose_filehandle=self.log_fh)
-        if self.assembled_ok:
-            os.symlink(velvet_contigs, os.path.basename(self.assembly_contigs))
-        else:
-            with open('velvetg_errors', 'w') as f:
-                print(err, file=f)
-                f.close()
-            self.status_flag.add('assembly_fail')
-
-        os.chdir(cwd)
-
-
-    def _assemble_with_spades(self, unittest=False):
-        cmd = ' '.join([
-            self.spades_exe,
-            '-1', self.reads1,
-            '-2', self.reads2,
-            '-o', self.assembler_dir,
-            '-k', str(self.assembly_kmer),
-            '--threads', str(self.threads),
-            '--untrusted-contigs', self.gene_fa,
-        ])
-        if self.spades_other is not None:
-            cmd += ' ' + self.spades_other
-
-        cwd = os.getcwd()
-        os.chdir(self.assembly_dir)
-        spades_contigs = os.path.join(os.path.split(self.assembler_dir)[1], 'scaffolds.fasta')
-
-        if unittest:
-            os.mkdir(self.assembler_dir)
-            open(spades_contigs, 'w').close()
-            self.assembled_ok = True
-        else:
-            self.assembled_ok, err = common.syscall(cmd, verbose=True, allow_fail=True, verbose_filehandle=self.log_fh, print_errors=False)
-        if self.assembled_ok:
-            os.symlink(spades_contigs, os.path.basename(self.assembly_contigs))
-        else:
-            spades_errors_file = os.path.join(self.root_dir, 'spades_errors')
-            with open(spades_errors_file, 'w') as f:
-                print(err, file=f)
-            f.close()
-            self.status_flag.add('assembly_fail')
-            total_reads = self._get_read_counts()
-            print('WARNING: assembly failed for cluster', self.name, 'which is usually due to not enough reads for this cluster (from spurious mapping) and nothing to worry about.', file=sys.stderr)
-            print('WARNING: assembly failed for cluster', self.name, ' ... number of reads for this cluster:', total_reads, file=sys.stderr)
-            print('WARNING: assembly failed for cluster', self.name, ' ... SPAdes errors written to:', spades_errors_file, file=sys.stderr)
-
-        os.chdir(cwd)
-
-
-    def _scaffold_with_sspace(self):
-        if not os.path.exists(self.assembly_contigs):
-            raise Error('Cannot scaffold because contigs file not found: ' + self.assembly_contigs)
-
-        try:
-            os.mkdir(self.scaffold_dir)
-        except:
-            raise Error('Error mkdir '+  self.scaffold_dir)
-
-        cwd = os.getcwd()
-
-        if self.sspace_exe is None:
-            os.chdir(self.assembly_dir)
-            os.symlink(os.path.basename(self.assembly_contigs), os.path.basename(self.scaffolder_scaffolds))
-            os.chdir(cwd)
-            return
-
-        os.chdir(self.scaffold_dir)
-        lib_file = 'lib'
-        with open(lib_file, 'w') as f:
-            print('LIB', self.reads1, self.reads2, int(self.reads_insert), self.sspace_sd, 'FR', file=f)
-
-        cmd = ' '.join([
-            'perl', self.sspace_exe,
-            '-k', str(self.sspace_k),
-            '-l', lib_file,
-            '-s', self.assembly_contigs
-        ])
-
-        sspace_scaffolds = os.path.abspath('standard_output.final.scaffolds.fasta')
-        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
-        os.chdir(self.assembly_dir)
-        os.symlink(os.path.relpath(sspace_scaffolds), os.path.basename(self.scaffolder_scaffolds))
-        os.chdir(cwd)
-
-
-    def _has_gaps_to_fill(self, filename):
-        seq_reader = pyfastaq.sequences.file_reader(filename)
-        for seq in seq_reader:
-            if 'n' in seq.seq or 'N' in seq.seq:
-                return True
-        return False
-
-
-    def _gap_fill_with_gapfiller(self):
-        if not os.path.exists(self.scaffolder_scaffolds):
-            raise Error('Cannot gap fill because scaffolds file not found: ' + self.scaffolder_scaffolds)
-
-
-        cwd = os.getcwd()
-
-        if self.gapfiller_exe is None or not self._has_gaps_to_fill(self.scaffolder_scaffolds):
-            self._rename_scaffolds(self.scaffolder_scaffolds, self.gapfilled_scaffolds)
-            return
-
-        try:
-            os.mkdir(self.gapfill_dir)
-        except:
-            raise Error('Error mkdir '+  self.gapfill_dir)
-
-        os.chdir(self.gapfill_dir)
-        lib_file = 'lib'
-        with open(lib_file, 'w') as f:
-            print('LIB', 'bwa', self.reads1, self.reads2, self.reads_insert, self.sspace_sd, 'FR', file=f)
-
-        cmd = ' '.join([
-            'perl', self.gapfiller_exe,
-            '-l', lib_file,
-            '-s', self.scaffolder_scaffolds
-        ])
-
-        gapfilled_scaffolds = os.path.join(self.gapfill_dir, 'standard_output', 'standard_output.gapfilled.final.fa')
-        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
-        self._rename_scaffolds(gapfilled_scaffolds, self.gapfilled_scaffolds)
-        os.chdir(cwd)
-
-
-    def _rename_scaffolds(self, infile, outfile):
-        freader = pyfastaq.sequences.file_reader(infile)
-        f_out = pyfastaq.utils.open_file_write(outfile)
-        i = 1
-        for scaff in freader:
-            scaff.id = self.gene.id + '.scaffold.' + str(i)
-            i += 1
-            print(scaff, file=f_out)
-        pyfastaq.utils.close(f_out)
-
-
-    def _run_nucmer(self, qry, outfile, show_snps=False):
-        pymummer.nucmer.Runner(
-            self.gene_fa,
-            qry,
-            outfile,
-            min_id=self.nucmer_min_id,
-            min_length=self.nucmer_min_len,
-            breaklen=self.nucmer_breaklen,
-            show_snps=show_snps
-        ).run()
-
-
-    def _fix_contig_orientation(self):
-        if not os.path.exists(self.gapfilled_scaffolds):
-            raise Error('Cannot fix orientation of assembly contigs because file not found: ' + self.gapfilled_scaffolds)
-
-        tmp_coords = os.path.join(self.root_dir, 'tmp.coords')
-        self._run_nucmer(self.gapfilled_scaffolds, tmp_coords)
-
-        to_revcomp = set()
-        not_revcomp = set()
-        file_reader = pymummer.coords_file.reader(tmp_coords)
-        for hit in file_reader:
-            if hit.on_same_strand():
-                not_revcomp.add(hit.qry_name)
-            else:
-                to_revcomp.add(hit.qry_name)
-
-        os.unlink(tmp_coords)
-        in_both = to_revcomp.intersection(not_revcomp)
-        for name in in_both:
-            print('WARNING: hits to both strands of gene for scaffold. Interpretation of any variants cannot be trusted for this scaffold:', name, file=sys.stderr)
-            to_revcomp.remove(name)
-            self.status_flag.add('hit_both_strands')
-
-        f = pyfastaq.utils.open_file_write(self.final_assembly_fa)
-        seq_reader = pyfastaq.sequences.file_reader(self.gapfilled_scaffolds)
-        for seq in seq_reader:
-            if seq.id in to_revcomp:
-                seq.revcomp()
-            print(seq, file=f)
-        pyfastaq.utils.close(f)
-
-
-    def _load_final_contigs(self):
-        if not os.path.exists(self.final_assembly_fa):
-            raise Error('Cannot load final assembled contigs because file not found:' + self.final_assembly_fa)
-
-        self.final_assembly = {}
-        pyfastaq.tasks.file_to_dict(self.final_assembly_fa, self.final_assembly)
-
-
-    def _parse_assembly_bam(self):
-        if not os.path.exists(self.final_assembly_bam):
-            raise Error('File not found: ' + self.final_assembly_bam)
-
-        bam_parser = bam_parse.Parser(self.final_assembly_bam, self.final_assembly)
-        bam_parser.parse()
-        bam_parser.write_files(self.final_assembly_bam)
-        if not bam_parser.scaff_graph_is_consistent(self.min_scaff_depth, self.max_insert):
-            self.status_flag.add('scaffold_graph_bad')
-
-
-    def _parse_assembly_vs_gene_coords(self):
-        file_reader = pymummer.coords_file.reader(self.assembly_vs_gene_coords)
-        self.nucmer_hits = {}
-        for hit in file_reader:
-            assert hit.ref_name == self.gene.id
-            contig = hit.qry_name
-            if contig not in self.nucmer_hits:
-                self.nucmer_hits[contig] = []
-            self.nucmer_hits[contig].append(copy.copy(hit))
-
-
-    def _nucmer_hits_to_percent_identity(self):
-        self.percent_identities = {}
-        for contig in self.nucmer_hits:
-            product_sum = 0
-            length_sum = 0
-            for hit in self.nucmer_hits[contig]:
-                product_sum += hit.hit_length_qry * hit.percent_identity
-                length_sum += hit.hit_length_qry
-            assert length_sum > 0
-            self.percent_identities[contig] = round(product_sum / length_sum, 2)
-
-
-    def _nucmer_hits_to_scaff_coords(self):
-        coords = {}
-        for l in self.nucmer_hits.values():
-            for hit in l:
-                if hit.qry_name not in coords:
-                    coords[hit.qry_name] = []
-                coords[hit.qry_name].append(hit.qry_coords())
-
-        for scaff in coords:
-            pyfastaq.intervals.merge_overlapping_in_list(coords[scaff])
-
-        return coords
-
-
-    def _nucmer_hits_to_ref_coords(self, contig=None):
-        coords = []
-        if contig is None:
-            keys = list(self.nucmer_hits.keys())
-        else:
-            keys = [contig]
-
-        for key in keys:
-            coords += [hit.ref_coords() for hit in self.nucmer_hits[key]]
-        coords.sort()
-        return coords
-
-
-    def _nucmer_hits_to_gene_cov_per_contig(self):
-        cov = {}
-        for contig in self.nucmer_hits:
-            coords = self._nucmer_hits_to_ref_coords(contig)
-            pyfastaq.intervals.merge_overlapping_in_list(coords)
-            cov[contig] = pyfastaq.intervals.length_sum_from_list(coords)
-        return cov
+            self.extern_progs = extern_progs
 
 
     @staticmethod
-    def _nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, outfile):
-        f = pyfastaq.utils.open_file_write(outfile)
-
-        for contig in sorted(nucmer_hits):
-            for hit in nucmer_hits[contig]:
-                qry_coords = hit.qry_coords()
-                fa = assembly[hit.qry_name].subseq(qry_coords.start, qry_coords.end + 1)
-                if hit.on_same_strand():
-                    strand = '+'
-                else:
-                    fa.revcomp()
-                    strand = '-'
-                ref_coords = hit.ref_coords()
-                fa.id = '.'.join([
-                    ref_gene.id,
-                    str(ref_coords.start + 1),
-                    str(ref_coords.end + 1),
-                    contig,
-                    str(qry_coords.start + 1),
-                    str(qry_coords.end + 1),
-                    strand
-                ])
-
-                if hit.hit_length_ref == hit.ref_length:
-                    fa.id += '.complete'
-
-                print(fa, file=f)
-
-        pyfastaq.utils.close(f)
-
-
-    def _whole_gene_covered_by_nucmer_hits(self):
-        covered = self._nucmer_hits_to_ref_coords()
-        pyfastaq.intervals.merge_overlapping_in_list(covered)
-        return pyfastaq.intervals.length_sum_from_list(covered) / len(self.gene) >= self.assembled_threshold
-
-
-    def _gene_coverage_unique(self):
-        covered = self._nucmer_hits_to_ref_coords()
-        covered.sort()
-        if len(covered) <= 1:
-            return True
-
-        coverage = {}
-        for i in covered:
-            for j in range(i.start, i.end + 1):
-                coverage[j] = coverage.get(j, 0) + 1
-
-        bases_depth_at_least_two = len([1 for x in coverage.values() if x > 1])
-        return bases_depth_at_least_two / len(self.gene) <= self.unique_threshold
-
-
-    def _gene_covered_by_complete_contig_with_orf(self):
-        for l in self.nucmer_hits.values():
-            for hit in l:
-                if hit.hit_length_ref == len(self.gene):
-                    start = min(hit.qry_start, hit.qry_end)
-                    end = max(hit.qry_start, hit.qry_end)
-                    assembled_gene = pyfastaq.sequences.Fasta('x', self.final_assembly[hit.qry_name][start:end+1])
-                    if (hit.ref_start < hit.ref_end) != (hit.qry_start < hit.qry_end):
-                        assembled_gene.revcomp()
-                    assembled_gene_aa = assembled_gene.translate()
-                    orfs = assembled_gene.orfs()
-                    if len(orfs) == 0:
-                        continue
-
-                    max_orf = orfs[0]
-                    for o in orfs:
-                        if len(o) > len(max_orf):
-                            max_orf = o
-
-                    if len(max_orf) == len(assembled_gene):
-                        return True
-        return False
-
-
-    def _gene_covered_by_at_least_one_full_length_contig(self):
-        for l in self.nucmer_hits.values():
-            for hit in l:
-                if len(hit.ref_coords()) == len(self.gene):
-                    return True
-        return False
-
-
-    def _update_flag_from_nucmer_file(self):
-        if self._whole_gene_covered_by_nucmer_hits():
-            self.status_flag.add('gene_assembled')
-
-        if self._gene_covered_by_at_least_one_full_length_contig():
-            self.status_flag.add('gene_assembled_into_one_contig')
-
-        if not self._gene_coverage_unique():
-            self.status_flag.add('gene_region_assembled_twice')
-
-        if self._gene_covered_by_complete_contig_with_orf():
-            self.status_flag.add('complete_orf')
-
-        if len(self.nucmer_hits) == 1:
-            self.status_flag.add('unique_contig')
-
-
-    def _get_mummer_variants(self):
-        snp_file = self.assembly_vs_gene_coords + '.snps'
-        if not os.path.exists(snp_file):
-            raise Error('File not found ' + snp_file)
-        variants = pymummer.snp_file.get_all_variants(snp_file)
-        self.mummer_variants = {}
-
-        if len(variants) == 0:
-            return
+    def _count_reads(reads1, reads2):
+        '''Returns the total number of sequences in the two reads files, which must each hold the same number'''
+        n_fwd, n_rev = [pyfastaq.tasks.count_sequences(fname) for fname in (reads1, reads2)]
+        assert n_fwd == n_rev
+        return n_fwd + n_rev
 
-        variants.sort(key=operator.attrgetter('qry_name'))
-        variants.sort(key=operator.attrgetter('ref_start'))
 
-        for v in variants:
-            if v.qry_name not in self.mummer_variants:
-                self.mummer_variants[v.qry_name] = []
-            self.mummer_variants[v.qry_name].append(v)
-
-        for contig in self.mummer_variants:
-            l = self.mummer_variants[contig]
-            if len(l) > 1:
-                new_l = [[l[0]]]
-                previous_codon_start = self._get_codon_start(0, l[0].ref_start)
-                for variant in l[1:]:
-                    codon_start = self._get_codon_start(0, variant.ref_start)
-                    if codon_start == previous_codon_start:
-                        new_l[-1].append(variant)
-                    else:
-                        new_l.append([variant])
-                        previous_codon_start = codon_start
-                self.mummer_variants[contig] = new_l
-            else:
-                self.mummer_variants[contig] = [l]
-
-
-    def _filter_mummer_variants(self):
-        if len(self.mummer_variants) == 0:
+    def _clean(self):
+        if self.clean == 0:
+            print('   ... not deleting anything because --clean 0 used', file=self.log_fh)
             return
-
-        for contig in self.mummer_variants:
-            variants = self.mummer_variants[contig]
-            for i in range(len(variants)):
-                t = self._get_variant_effect(variants[i])
-                if t is not None and t[0] in ['TRUNC', 'FSHIFT']:
-                    break
-            self.mummer_variants[contig] = variants[:i+1]
-
-
-    def _get_codon_start(self, gene_start, position):
-        assert position >= gene_start
-        while  (position - gene_start) % 3 != 0:
-            position -= 1
-        return position
-
-
-    def _get_variant_effect(self, variants):
-        if len(variants) == 0:
-            return None
-
-        var_types = [x.var_type for x in variants]
-        if len(set(var_types)) != 1:
-            return None
-
-        var_type = var_types[0]
-
-        assert set([x.ref_name for x in variants]) == set([self.gene.id])
-        codon_starts = [self._get_codon_start(0, x.ref_start) for x in variants]
-        assert len(set(codon_starts)) == 1
-        codon_start = codon_starts[0]
-        aa_start = codon_start // 3
-        ref_codon = pyfastaq.sequences.Fasta('codon', self.gene[codon_start:codon_start+3])
-        ref_aa = ref_codon.translate()
-
-        if var_type == pymummer.variant.SNP:
-            new_codon = list(ref_codon.seq)
-            for v in variants:
-                new_codon[v.ref_start - codon_start] = v.qry_base
-            new_codon = pyfastaq.sequences.Fasta('new', ''.join(new_codon))
-            qry_aa = new_codon.translate()
-
-            if ref_aa.seq == qry_aa.seq:
-                return ('SYN', '.')
-            elif qry_aa.seq == '*':
-                return ('TRUNC', ref_aa.seq + str(aa_start + 1) + 'trunc')
-            else:
-                return ('NONSYN', ref_aa.seq + str(aa_start + 1) + qry_aa.seq)
-        elif var_type in [pymummer.variant.INS, pymummer.variant.DEL]:
-            if len(variants) > 1:
-                print('More than one indel in same codon not yet implemented!', self.gene.id, file=sys.stderr)
-                return None
-
-            var = variants[0]
-
-            if var_type == pymummer.variant.INS:
-                new_seq = pyfastaq.sequences.Fasta('seq', var.qry_base)
-            else:
-                new_seq = pyfastaq.sequences.Fasta('seq', var.ref_base)
-
-            if len(new_seq) % 3 != 0:
-                return ('FSHIFT', ref_aa.seq + str(aa_start + 1) + 'fs')
-
-            new_seq_aa = new_seq.translate()
-            if '*' in new_seq_aa.seq:
-                return ('TRUNC', ref_aa.seq + str(aa_start + 1) + 'trunc')
-            elif var_type == pymummer.variant.INS:
-                ref_codon_after_ins = pyfastaq.sequences.Fasta('codon', self.gene[codon_start+3:codon_start+6])
-                aa_after_ins = ref_codon_after_ins.translate()
-                return ('INS', ref_aa.seq + str(aa_start + 1) + '_' + aa_after_ins.seq + str(aa_start + 2) + 'ins' + new_seq_aa.seq )
-            else:
-                if len(new_seq) == 3:
-                    return ('DEL', ref_aa.seq + str(aa_start + 1) + 'del')
-                else:
-                    assert len(new_seq) % 3 == 0
-                    new_aa = new_seq.translate()
-                    ref_codon_after_ins = pyfastaq.sequences.Fasta('codon', self.gene[codon_start+3:codon_start+6])
-                    aa_after_ins = ref_codon_after_ins.translate()
-                    return ('DEL', ref_aa.seq + str(aa_start + 1)+ '_' + aa_after_ins.seq + str(aa_start + 2) + 'del')
-
-        else:
-            return ('UNKNOWN', '.')
-
-
-    def _make_assembly_vcf(self):
-        tmp_vcf = self.final_assembly_vcf + '.tmp'
-        cmd = ' '.join([
-            self.samtools_exe, 'mpileup',
-            '-t INFO/DPR,DV',
-            '-A',
-            '-f', self.final_assembly_fa,
-            '-u',
-            '-v',
-            self.final_assembly_bam,
-            '>',
-            tmp_vcf
-        ])
-
-        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
-
-        cmd = ' '.join([
-            self.bcftools_exe, 'call -m',
-            tmp_vcf,
-            '|',
-            self.bcftools_exe, 'query',
-            r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
-            '>',
-            self.final_assembly_read_depths + '.tmp'
-        ])
-
-        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
-        pysam.tabix_compress(self.final_assembly_read_depths + '.tmp', self.final_assembly_read_depths)
-        pysam.tabix_index(self.final_assembly_read_depths, seq_col=0, start_col=1, end_col=1)
-        os.unlink(self.final_assembly_read_depths + '.tmp')
-
-        cmd = ' '.join([
-            self.bcftools_exe, 'call -m -v',
-            tmp_vcf,
-            '|',
-            self.bcftools_exe, 'filter',
-            '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
-                  ' & MIN(DV)>=' + str(self.bcf_min_dv),
-                  ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
-                  ' & QUAL >=', str(self.bcf_min_qual), '"',
-            '-o', self.final_assembly_vcf
-        ])
-
-        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
-        os.unlink(tmp_vcf)
-
-
-    def _get_assembly_read_depths(self, ref, position):
-        '''Returns total read depth and depth of reads supporting alternative (if present)'''
-        assert os.path.exists(self.final_assembly_read_depths)
-        assert os.path.exists(self.final_assembly_read_depths + '.tbi')
-        tbx = pysam.TabixFile(self.final_assembly_read_depths)
-        try:
-            rows = [x for x in tbx.fetch(ref, position, position + 1)]
-        except:
-            return None
-
-        if len(rows) > 1: # which happens with indels, mutiple lines for same base of reference
-            test_rows = [x for x in rows if x.rstrip().split()[3] != '.']
-            if len(test_rows) != 1:
-                rows = [rows[-1]]
-            else:
-                rows = test_rows
-
-        if len(rows) == 1:
-            r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
-            return ref_base, alt_base, int(ref_counts), alt_counts
-        else:
-            return None
-
-
-    def _get_samtools_variant_positions(self):
-        if not os.path.exists(self.final_assembly_vcf):
-            return []
-        f = pyfastaq.utils.open_file_read(self.final_assembly_vcf)
-        positions = [l.rstrip().split('\t')[0:2] for l in f if not l.startswith('#')]
-        positions = [(t[0], int(t[1]) - 1) for t in positions]
-        pyfastaq.utils.close(f)
-        return positions
-
-
-    def _get_samtools_variants(self, positions=None):
-        if positions is None:
-            positions = self._get_samtools_variant_positions()
-        variants = {}
-        if len(positions) == 0:
-            return variants
-        if not (os.path.exists(self.final_assembly_vcf) and os.path.exists(self.final_assembly_read_depths)):
-            return variants
-        for t in positions:
-            name, pos = t[0], t[1]
-            depths = self._get_assembly_read_depths(name, pos)
-            if depths is None:
-                raise Error('Error getting read depths for sequence ' + name + ' at position ' + t[1])
-            if name not in variants:
-                variants[name] = {}
-            variants[name][t[1]] = depths
-        return variants
-
-
-    def _get_vcf_variant_counts(self):
-        scaff_coords = self._nucmer_hits_to_scaff_coords()
-        self.vcf_variant_counts = {}
-        f = pyfastaq.utils.open_file_read(self.final_assembly_vcf)
-        for line in f:
-            if line.startswith('#'):
-                continue
-
-            data = line.rstrip().split('\t')
-            scaff = data[0]
-
-            if scaff in scaff_coords:
-                position = int(data[1]) - 1
-                i = pyfastaq.intervals.Interval(position, position)
-                intersects = len([x for x in scaff_coords[scaff] if x.intersects(i)]) > 0
-                if intersects:
-                    self.vcf_variant_counts[scaff] = self.vcf_variant_counts.get(scaff, 0) + 1
-
-        pyfastaq.utils.close(f)
-        total = sum(list(self.vcf_variant_counts.values()))
-        if total >= 1:
-            self.status_flag.add('variants_suggest_collapsed_repeat')
-
-
-    def _initial_make_report_lines(self):
-        '''Makes report lines. While they are being made, we discover if there were
-        and non-synonymous variants. This affects the flag, which also gets updated
-        by the function. To then fix the report lines, must run _update_flag_in_report_lines()'''
-        self.report_lines = []
-        total_reads = self._get_read_counts()
-
-        if not self.assembled_ok:
-            gene_name = 'NA' if self.gene is None else self.gene.id
-            gene_length = '.' if self.gene is None else len(self.gene)
-            self.report_lines.append([
-                    gene_name,
-                    self.status_flag.to_number(),
-                    total_reads,
-                    self.name,
-                    gene_length,
-                    '.',
-                    '.',
-                  ] + \
-                  ['.'] * 14
-            )
+        elif self.clean == 2:
+            print('    rm -r ', self.root_dir)
+            shutil.rmtree(self.root_dir)
             return
 
-        cov_per_contig = self._nucmer_hits_to_gene_cov_per_contig()
-        samtools_variants = self._get_samtools_variants()
-
-
-        for contig in self.mummer_variants:
-            for variants in self.mummer_variants[contig]:
-                t = self._get_variant_effect(variants)
-                if t is not None:
-                    effect, new_bases = t
-                    if effect != 'SYN':
-                        self.status_flag.add('has_nonsynonymous_variants')
-
-                    for v in variants:
-                        depths = self._get_assembly_read_depths(contig, v.qry_start)
-                        if depths is None:
-                            # this happens with low coverage contigs. It can get assembled, but
-                            # there are some bases that do not have reads mapped to them.
-                            # If mummer called a variant at one of these, then we're looking
-                            # for read dpeth where there is none.
-                            print('Warning: could not get read depth info on contig "' + contig + '" at position ', str(v.qry_start + 1), 'from file', self.final_assembly_read_depths, file=sys.stderr)
-                            print(' - a variant was called at this position using nucmer, but there is no read depth (probably a mapping artifact)', file=sys.stderr)
-                            depths = ['.'] * 4
-
-                        ref_base, alt_base, ref_counts, alt_counts = depths
-
-                        self.report_lines.append([
-                            self.gene.id,
-                            self.status_flag.to_number(),
-                            total_reads,
-                            self.name,
-                            len(self.gene),
-                            cov_per_contig[contig],
-                            self.percent_identities[contig],
-                            pymummer.variant.var_types[v.var_type],
-                            effect,
-                            new_bases,
-                            v.ref_start + 1,
-                            v.ref_end + 1,
-                            v.ref_base,
-                            v.qry_name,
-                            v.qry_length,
-                            v.qry_start + 1,
-                            v.qry_end + 1,
-                            v.qry_base,
-                            ref_counts,
-                            alt_base,
-                            alt_counts,
-                        ])
-
-                        if contig in samtools_variants and v.qry_start in samtools_variants[contig]:
-                            del samtools_variants[contig][v.qry_start]
-                            if len(samtools_variants[contig]) == 0:
-                                del samtools_variants[contig]
-
-            if contig in samtools_variants:
-                for pos in samtools_variants[contig]:
-                    ref_base, alt_base, ref_counts, alt_counts = samtools_variants[contig][pos]
-                    self.report_lines.append(
-                      [
-                        self.gene.id,
-                        self.status_flag.to_number(),
-                        total_reads,
-                        self.name,
-                        len(self.gene),
-                        cov_per_contig[contig],
-                        self.percent_identities[contig],
-                      ] + \
-                      ['.'] * 6 + \
-                      [
-                        contig,
-                        len(self.final_assembly[contig]),
-                        pos + 1,
-                        pos + 1,
-                        ref_base,
-                        ref_counts,
-                        alt_base,
-                        alt_counts
-                      ]
-                    )
-
-        if len(self.report_lines) == 0:
-            for contig in self.percent_identities:
-                self.report_lines.append([
-                    self.gene.id,
-                    self.status_flag.to_number(),
-                    total_reads,
-                    self.name,
-                    len(self.gene),
-                    cov_per_contig[contig],
-                    self.percent_identities[contig],
-                  ] + \
-                  ['.'] * 6 + [contig, len(self.final_assembly[contig])] + ['.'] * 6
-                )
-
-        self.report_lines.sort(key=itemgetter(0, 14, 15))
-
-
-    def _update_flag_in_report_lines(self):
-        '''This corrects the flag in all the report lines made by _initial_make_report_lines()'''
-        flag_column = 1
-        if self.status_flag.has('has_nonsynonymous_variants'):
-            for line in self.report_lines:
-                line[flag_column] = self.status_flag.to_number()
-
-
-    def _make_report_lines(self):
-        self._initial_make_report_lines()
-        self._update_flag_in_report_lines()
-
-
-    def _clean(self):
-        print('Cleaning', self.root_dir, file=self.log_fh)
-
-        if self.clean > 0:
-            print('  rm -r', self.assembly_dir, file=self.log_fh)
+        if os.path.exists(self.assembly_dir):
+            print('    rm -r', self.assembly_dir, file=self.log_fh)
             shutil.rmtree(self.assembly_dir)
 
-        to_clean = [
-            [
-                'assembly.reads_mapped.unsorted.bam',
-            ],
-            [
-                'assembly.fa.fai',
-                'assembly.reads_mapped.bam.scaff',
-                'assembly.reads_mapped.bam.soft_clipped',
-                'assembly.reads_mapped.bam.unmapped_mates',
-                'assembly_vs_gene.coords',
-                'assembly_vs_gene.coords.snps',
-                'genes.fa',
-                'genes.fa.fai',
-                'reads_1.fq',
-                'reads_2.fq',
-            ],
-            [
-                'assembly.fa.fai',
-                'assembly.reads_mapped.bam',
-                'assembly.reads_mapped.bam.vcf',
-                'assembly_vs_gene.coords',
-                'assembly_vs_gene.coords.snps',
-            ]
+        to_delete = [
+            self.reads1,
+            self.reads2,
+            self.references_fa,
+            self.references_fa + '.fai',
+            self.final_assembly_bam + '.read_depths.gz',
+            self.final_assembly_bam + '.read_depths.gz.tbi',
+            self.final_assembly_bam + '.scaff',
+            self.final_assembly_bam + '.soft_clipped',
+            self.final_assembly_bam + '.unmapped_mates',
+            self.final_assembly_bam + '.unsorted.bam',
         ]
 
-        for i in range(self.clean + 1):
-            for fname in to_clean[i]:
-                fullname = os.path.join(self.root_dir, fname)
-                if os.path.exists(fullname):
-                    print('  rm', fname, file=self.log_fh)
-                    os.unlink(fullname)
+        for filename in to_delete:
+            if os.path.exists(filename):
+                print('    rm', filename, file=self.log_fh)
+                try:
+                    os.unlink(filename)
+                except OSError as err:  # filesystem failures only; do not trap KeyboardInterrupt/SystemExit
+                    raise Error('Error deleting file ' + filename) from err  # one message string, cause chained
 
 
     def run(self):
         self.logfile = os.path.join(self.root_dir, 'log.txt')
         self.log_fh = pyfastaq.utils.open_file_write(self.logfile)
-        self.gene = self._choose_best_gene()
-        if self.gene is None:
+
+        print('Choosing best reference sequence:', file=self.log_fh)
+        seq_chooser = best_seq_chooser.BestSeqChooser(
+            self.reads1,
+            self.reads2,
+            self.references_fa,
+            self.log_fh,
+            samtools_exe=self.extern_progs.exe('samtools'),
+            bowtie2_exe=self.extern_progs.exe('bowtie2'),
+            bowtie2_preset=self.bowtie2_preset,
+            threads=1,
+        )
+        self.ref_sequence = seq_chooser.best_seq(self.reference_fa)
+
+        if self.ref_sequence is None:
+            self.status_flag.add('ref_seq_choose_fail')
             self.assembled_ok = False
         else:
-            if self.assembler == 'velvet':
-                self._assemble_with_velvet()
-            elif self.assembler == 'spades':
-                self._assemble_with_spades()
+            print('\nAssembling reads:', file=self.log_fh)
+            self.ref_sequence_type = self.refdata.sequence_type(self.ref_sequence.id)
+            assert self.ref_sequence_type is not None
+            self.assembly = assembly.Assembly(
+              self.reads1,
+              self.reads2,
+              self.reference_fa,
+              self.assembly_dir,
+              self.final_assembly_fa,
+              self.final_assembly_bam,
+              self.log_fh,
+              scaff_name_prefix=self.ref_sequence.id,
+              kmer=self.assembly_kmer,
+              assembler=self.assembler,
+              spades_other_options=self.spades_other_options,
+              sspace_k=self.sspace_k,
+              sspace_sd=self.sspace_sd,
+              reads_insert=self.reads_insert,
+              extern_progs=self.extern_progs
+            )
 
-        # velvet can finish successfully, but make an empty contigs file
-        if self.assembled_ok:
-            number_of_contigs = pyfastaq.tasks.count_sequences(self.assembly_contigs)
-            if number_of_contigs == 0:
-                self.assembled_ok = False
-                self.status_flag.add('assembly_fail')
+            self.assembly.run()
+            self.assembled_ok = self.assembly.assembled_ok
 
         if self.assembled_ok:
-            # finish the assembly
-            self._scaffold_with_sspace()
-            self._gap_fill_with_gapfiller()
-            self._fix_contig_orientation()
-            self._load_final_contigs()
+            print('\nAssembly was successful\n\nMapping reads to assembly:', file=self.log_fh)
 
-            # map reads to assembly
             mapping.run_bowtie2(
                 self.reads1,
                 self.reads2,
                 self.final_assembly_fa,
                 self.final_assembly_bam[:-4],
-                threads=self.threads,
+                threads=1,
                 sort=True,
-                samtools=self.samtools_exe,
-                bowtie2=self.bowtie2_exe,
+                samtools=self.extern_progs.exe('samtools'),
+                bowtie2=self.extern_progs.exe('bowtie2'),
                 bowtie2_preset=self.bowtie2_preset,
                 verbose=True,
                 verbose_filehandle=self.log_fh
             )
-            self._parse_assembly_bam()
+
+            if self.assembly.has_contigs_on_both_strands:
+                self.status_flag.add('hit_both_strands')
+
+            print('\nMaking and checking scaffold graph', file=self.log_fh)
+
+            bam_parser = bam_parse.Parser(self.final_assembly_bam, self.assembly.sequences)
+            bam_parser.parse()
+            if not bam_parser.scaff_graph_is_consistent(self.min_scaff_depth, self.max_insert):
+                self.status_flag.add('scaffold_graph_bad')
+
+            print('Comparing assembly against reference sequence', file=self.log_fh)
+            self.assembly_compare = assembly_compare.AssemblyCompare(
+              self.final_assembly_fa,
+              self.assembly.sequences,
+              self.reference_fa,
+              self.ref_sequence,
+              self.assembly_compare_prefix,
+              self.refdata,
+              nucmer_min_id=self.nucmer_min_id,
+              nucmer_min_len=self.nucmer_min_len,
+              nucmer_breaklen=self.nucmer_breaklen,
+              assembled_threshold=self.assembled_threshold,
+              unique_threshold=self.unique_threshold,
+            )
+            self.assembly_compare.run()
+            self.status_flag = self.assembly_compare.update_flag(self.status_flag)
+
+            nucmer_hits_to_ref = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(self.assembly_compare.nucmer_hits)
+            assembly_variants_obj = assembly_variants.AssemblyVariants(self.refdata, self.assembly_compare.nucmer_snps_file)
+            self.assembly_variants = assembly_variants_obj.get_variants(self.ref_sequence.id, nucmer_hits_to_ref)
+
+            for var_list in self.assembly_variants.values():
+                for var in var_list:
+                    if var[3] not in ['.', 'SYN', None]:
+                        self.status_flag.add('has_nonsynonymous_variants')
+                        break
+
+                if self.status_flag.has('has_nonsynonymous_variants'):
+                    break
 
 
-            # compare gene and assembly
-            self._run_nucmer(self.final_assembly_fa, self.assembly_vs_gene_coords, show_snps=True)
-            self._parse_assembly_vs_gene_coords()
-            self._nucmer_hits_to_percent_identity()
-            self._get_mummer_variants()
-            self._filter_mummer_variants()
-            self._update_flag_from_nucmer_file()
-            self._make_assembly_vcf()
-            self._get_vcf_variant_counts()
-            self._nucmer_hits_to_assembled_gene_sequences(self.nucmer_hits, self.gene, self.final_assembly, self.final_assembled_genes_fa)
+            print('\nCalling variants with samtools:', file=self.log_fh)
 
-        self._make_report_lines()
+            self.samtools_vars = samtools_variants.SamtoolsVariants(
+                self.final_assembly_fa,
+                self.final_assembly_bam,
+                self.samtools_vars_prefix,
+                log_fh=self.log_fh,
+                samtools_exe=self.extern_progs.exe('samtools'),
+                bcftools_exe=self.extern_progs.exe('bcftools'),
+                bcf_min_dp=self.bcf_min_dp,
+                bcf_min_dv=self.bcf_min_dv,
+                bcf_min_dv_over_dp=self.bcf_min_dv_over_dp,
+                bcf_min_qual=self.bcf_min_qual,
+            )
+            self.samtools_vars.run()
+            if self.samtools_vars.variants_in_coords(self.assembly_compare.assembly_match_coords(), self.samtools_vars.vcf_file):
+                self.status_flag.add('variants_suggest_collapsed_repeat')
+        else:
+            print('\nAssembly failed\n', file=self.log_fh)
+            self.status_flag.add('assembly_fail')
+
+        print('\nMaking report lines', file=self.log_fh)
+        self.report_lines = report.report_lines(self)
+        print('\nCleaning with clean option ', self.clean, ':', sep='', file=self.log_fh)
         self._clean()
+        print('Finished', file=self.log_fh)
         pyfastaq.utils.close(self.log_fh)
 
         # This stops multiprocessing complaining with the error:
         # multiprocessing.pool.MaybeEncodingError: Error sending result: '[<ariba.cluster.Cluster object at 0x7ffa50f8bcd0>]'. Reason: 'TypeError("cannot serialize '_io.TextIOWrapper' object",)'
         self.log_fh = None
+
diff --git a/ariba/clusters.py b/ariba/clusters.py
index b9391dfc..64b9dc80 100644
--- a/ariba/clusters.py
+++ b/ariba/clusters.py
@@ -1,4 +1,5 @@
 import os
+import copy
 import itertools
 import sys
 import shutil
@@ -6,7 +7,7 @@
 import multiprocessing
 import pysam
 import pyfastaq
-from ariba import cdhit, cluster, common, mapping, histogram, faidx
+from ariba import cdhit, cluster, common, mapping, histogram, faidx, report
 
 class Error (Exception): pass
 
@@ -22,17 +23,15 @@ def _run_cluster(obj, verbose):
 
 class Clusters:
     def __init__(self,
-      db_fasta,
+      refdata,
       reads_1,
       reads_2,
       outdir,
+      extern_progs,
       assembly_kmer=21,
       threads=1,
       verbose=False,
-      assembler='velvet',
-      smalt_k=13,
-      smalt_s=2,
-      smalt_min_id=0.9,
+      assembler='spades',
       spades_other=None,
       max_insert=1000,
       min_scaff_depth=10,
@@ -41,48 +40,38 @@ def __init__(self,
       nucmer_breaklen=50,
       assembled_threshold=0.95,
       unique_threshold=0.03,
-      bcftools_exe='bcftools',
-      gapfiller_exe='GapFiller.pl',
-      samtools_exe='samtools',
-      smalt_exe='smalt',
-      bowtie2_exe='bowtie2',
       bowtie2_preset='very-sensitive-local',
-      spades_exe='spades.py',
-      sspace_exe='SSPACE_Basic_v2.0.pl',
-      velvet_exe='velvet', # prefix of velvet{g,h}
       cdhit_seq_identity_threshold=0.9,
       cdhit_length_diff_cutoff=0.9,
       run_cd_hit=True,
       clean=1,
     ):
+        self.refdata = refdata
         self.reads_1 = os.path.abspath(reads_1)
         self.reads_2 = os.path.abspath(reads_2)
         self.outdir = os.path.abspath(outdir)
+        self.extern_progs = extern_progs
         self.clusters_outdir = os.path.join(self.outdir, 'Clusters')
-        self.clusters_info_file = os.path.join(self.outdir, 'clusters.tsv')
         self.clean = clean
 
         self.assembler = assembler
-        assert self.assembler in ['velvet', 'spades']
+        assert self.assembler in ['spades']
         self.assembly_kmer = assembly_kmer
         self.spades_other = spades_other
 
-        self.db_fasta_clustered = os.path.join(self.outdir, 'input_genes.clustered.fa')
+        self.refdata_files_prefix = os.path.join(self.outdir, 'refdata')
+        self.cdhit_files_prefix = os.path.join(self.outdir, 'cdhit')
+        self.cdhit_cluster_representatives_fa = self.cdhit_files_prefix + '.cluster_representatives.fa'
         self.cluster_ids = {}
-        self.bam_prefix = os.path.join(self.outdir, 'map_all_reads')
+        self.bam_prefix = self.cdhit_cluster_representatives_fa + '.map_reads'
         self.bam = self.bam_prefix + '.bam'
         self.report_file_tsv = os.path.join(self.outdir, 'report.tsv')
         self.report_file_xls = os.path.join(self.outdir, 'report.xls')
-        self.catted_assembled_genes_fasta = os.path.join(self.outdir, 'assembled_genes.fa')
+        self.catted_assembled_seqs_fasta = os.path.join(self.outdir, 'assembled_seqs.fa')
         self.threads = threads
         self.verbose = verbose
 
-        self.smalt_k = smalt_k
-        self.smalt_s = smalt_s
-        self.smalt_min_id = smalt_min_id
         self.max_insert = max_insert
-        self.smalt_exe = smalt_exe
-        self.bowtie2_exe = bowtie2_exe
         self.bowtie2_preset = bowtie2_preset
 
         self.insert_hist_bin = 10
@@ -102,25 +91,6 @@ def __init__(self,
         self.cluster_to_dir = {}  # gene name -> abs path of cluster directory
         self.clusters = {}        # gene name -> Cluster object
 
-        self.bcftools_exe = bcftools_exe
-
-        self.sspace_exe = shutil.which(sspace_exe)
-        if self.sspace_exe is None:
-            print('WARNING: SSPACE not found. Scaffolding and gap filling will be skipped!', file=sys.stderr)
-            self.gapfiller_exe = None
-        else:
-            self.sspace_exe = os.path.realpath(self.sspace_exe) # otherwise sspace dies loading packages
-            self.gapfiller_exe = shutil.which(gapfiller_exe)
-            if self.gapfiller_exe is None:
-                print('WARNING: GapFiller not found. No gap filling will be run after scaffolding!', file=sys.stderr)
-            else:
-                self.gapfiller_exe = os.path.realpath(self.gapfiller_exe) # otherwise gapfiller dies loading packages
-
-        self.samtools_exe = samtools_exe
-        self.spades_exe = spades_exe
-
-        self.velvet = velvet_exe
-
         self.cdhit_seq_identity_threshold = cdhit_seq_identity_threshold
         self.cdhit_length_diff_cutoff = cdhit_length_diff_cutoff
         self.run_cd_hit = run_cd_hit
@@ -131,48 +101,31 @@ def __init__(self,
             except:
                 raise Error('Error mkdir ' + d)
 
-        self.db_fasta = os.path.join(self.outdir, 'input_genes.not_clustered.fa')
-        pyfastaq.tasks.to_fasta(db_fasta, self.db_fasta, check_unique=True)
-        common.syscall(self.samtools_exe + ' faidx ' + self.db_fasta)
-
 
     def _run_cdhit(self):
-        r = cdhit.Runner(
-            self.db_fasta,
-            self.db_fasta_clustered,
+        self.cluster_ids = self.refdata.cluster_with_cdhit(
+            self.refdata_files_prefix + '.01.check_variants',
+            self.cdhit_files_prefix,
             seq_identity_threshold=self.cdhit_seq_identity_threshold,
             threads=self.threads,
             length_diff_cutoff=self.cdhit_length_diff_cutoff,
+            nocluster=not self.run_cd_hit,
             verbose=self.verbose,
         )
-        if self.run_cd_hit:
-            self.cluster_ids = r.run()
-        else:
-            if self.verbose:
-                print('Skipping cd-hit because --no_cdhit option used')
-            self.cluster_ids = r.fake_run()
-
-
-    def _write_clusters_info_file(self):
-        f = pyfastaq.utils.open_file_write(self.clusters_info_file)
-        print('#Cluster\tGene', file=f)
-        for c in sorted([int(x) for x in self.cluster_ids]):
-            for seqname in sorted(list(self.cluster_ids[str(c)])):
-                print(c, seqname, sep='\t', file=f)
-        pyfastaq.utils.close(f)
 
 
     def _map_reads_to_clustered_genes(self):
         mapping.run_bowtie2(
             self.reads_1,
             self.reads_2,
-            self.db_fasta_clustered,
+            self.cdhit_cluster_representatives_fa,
             self.bam_prefix,
             threads=self.threads,
-            samtools=self.samtools_exe,
-            bowtie2=self.bowtie2_exe,
+            samtools=self.extern_progs.exe('samtools'),
+            bowtie2=self.extern_progs.exe('bowtie2'),
             bowtie2_preset=self.bowtie2_preset,
             verbose=self.verbose,
+            remove_both_unmapped=True,
         )
 
 
@@ -287,100 +240,86 @@ def _init_and_run_clusters(self):
         counter = 0
         cluster_list = []
 
-        for gene in sorted(self.cluster_to_dir):
-            counter += 1
-            if self.verbose:
-                print('Constructing cluster', counter, 'of', str(len(self.cluster_to_dir)))
-            new_dir = self.cluster_to_dir[gene]
-
-            faidx.write_fa_subset(
-                self.cluster_ids[gene],
-                self.db_fasta,
-                os.path.join(new_dir, 'genes.fa'),
-                samtools_exe=self.samtools_exe,
-                verbose=self.verbose
-            )
-
-            cluster_list.append(cluster.Cluster(
-                new_dir,
-                gene,
-                assembly_kmer=self.assembly_kmer,
-                assembler=self.assembler,
-                max_insert=self.insert_proper_pair_max,
-                min_scaff_depth=self.min_scaff_depth,
-                nucmer_min_id=self.nucmer_min_id,
-                nucmer_min_len=self.nucmer_min_len,
-                nucmer_breaklen=self.nucmer_breaklen,
-                sspace_k=self.min_scaff_depth,
-                reads_insert=self.insert_size,
-                sspace_sd=self.insert_sspace_sd,
-                threads=1, # clusters now run in parallel, so this should always be 1!
-                assembled_threshold=self.assembled_threshold,
-                unique_threshold=self.unique_threshold,
-                bcftools_exe=self.bcftools_exe,
-                gapfiller_exe=self.gapfiller_exe,
-                samtools_exe=self.samtools_exe,
-                bowtie2_exe=self.bowtie2_exe,
-                bowtie2_preset=self.bowtie2_preset,
-                spades_exe=self.spades_exe,
-                sspace_exe=self.sspace_exe,
-                velvet_exe=self.velvet,
-                spades_other=self.spades_other,
-                clean=self.clean,
-            ))
+        for seq_type in sorted(self.cluster_ids):
+            if self.cluster_ids[seq_type] is None:
+                continue
+
+            for seq_name in sorted(self.cluster_ids[seq_type]):
+                if seq_name not in self.cluster_to_dir:
+                    continue
+                counter += 1
+                if self.verbose:
+                    print('Constructing cluster', seq_name + '.', counter, 'of', str(len(self.cluster_to_dir)))
+                new_dir = self.cluster_to_dir[seq_name]
+                self.refdata.write_seqs_to_fasta(os.path.join(new_dir, 'references.fa'), self.cluster_ids[seq_type][seq_name])
+
+                cluster_list.append(cluster.Cluster(
+                    new_dir,
+                    seq_name,
+                    refdata=self.refdata,
+                    assembly_kmer=self.assembly_kmer,
+                    assembler=self.assembler,
+                    max_insert=self.insert_proper_pair_max,
+                    min_scaff_depth=self.min_scaff_depth,
+                    nucmer_min_id=self.nucmer_min_id,
+                    nucmer_min_len=self.nucmer_min_len,
+                    nucmer_breaklen=self.nucmer_breaklen,
+                    reads_insert=self.insert_size,
+                    sspace_k=self.min_scaff_depth,
+                    sspace_sd=self.insert_sspace_sd,
+                    threads=1, # clusters now run in parallel, so this should always be 1!
+                    bcf_min_dp=10,            # let the user change this in a future version?
+                    bcf_min_dv=5,             # let the user change this in a future version?
+                    bcf_min_dv_over_dp=0.3,   # let the user change this in a future version?
+                    bcf_min_qual=20,          # let the user change this in a future version?
+                    assembled_threshold=self.assembled_threshold,
+                    unique_threshold=self.unique_threshold,
+                    bowtie2_preset=self.bowtie2_preset,
+                    spades_other_options=self.spades_other,
+                    clean=self.clean,
+                    extern_progs=self.extern_progs,
+                ))
+
 
         pool = multiprocessing.Pool(self.threads)
         cluster_list = pool.starmap(_run_cluster, zip(cluster_list, itertools.repeat(self.verbose)))
         self.clusters = {c.name: c for c in cluster_list}
 
 
-    def _write_reports(self):
-        columns = [
-            '#gene',
-            'flag',
-            'reads',
-            'cluster',
-            'gene_len',
-            'assembled',
-            'pc_ident',
-            'var_type',
-            'var_effect',
-            'new_aa',
-            'gene_start',
-            'gene_end',
-            'gene_nt',
-            'scaffold',
-            'scaff_len',
-            'scaff_start',
-            'scaff_end',
-            'scaff_nt',
-            'read_depth',
-            'alt_bases',
-            'ref_alt_depth'
-        ]
+    @staticmethod
+    def _write_reports(clusters_in, tsv_out, xls_out):
+        columns = copy.copy(report.columns)
+        columns[0] = '#' + columns[0]
 
-        f = pyfastaq.utils.open_file_write(self.report_file_tsv)
+        f = pyfastaq.utils.open_file_write(tsv_out)
         print('\t'.join(columns), file=f)
 
-        columns[0] = 'gene'
+        columns[0] = columns[0][1:]
         workbook = openpyxl.Workbook()
         worksheet = workbook.worksheets[0]
         worksheet.title = 'ARIBA_report'
         worksheet.append(columns)
 
-        for gene in sorted(self.clusters):
-            for line in self.clusters[gene].report_lines:
-                print('\t'.join([str(x) for x in line]), file=f)
-                worksheet.append(line)
+        for seq_name in sorted(clusters_in):
+            if clusters_in[seq_name].report_lines is None:
+                continue
+
+            for line in clusters_in[seq_name].report_lines:
+                print(line, file=f)
+                worksheet.append(line.split('\t'))
+
         pyfastaq.utils.close(f)
-        workbook.save(self.report_file_xls)
+        workbook.save(xls_out)
 
 
-    def _write_catted_assembled_genes_fasta(self):
-        f = pyfastaq.utils.open_file_write(self.catted_assembled_genes_fasta)
+    def _write_catted_assembled_seqs_fasta(self, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
 
         for gene in sorted(self.clusters):
-            cluster_fasta = self.clusters[gene].final_assembled_genes_fa
+            try:
+                cluster_fasta = self.clusters[gene].assembly_compare.assembled_ref_seqs_file
+            except:
+                continue
             if os.path.exists(cluster_fasta):
                 file_reader = pyfastaq.sequences.file_reader(cluster_fasta)
                 for seq in file_reader:
@@ -390,54 +329,74 @@ def _write_catted_assembled_genes_fasta(self):
 
 
     def _clean(self):
-        to_clean = [
-            [
-            ],
-            [
-                self.bam,
-                self.db_fasta,
-                self.db_fasta + '.fai',
-            ],
-            [
-                self.db_fasta_clustered,
-                self.db_fasta_clustered + '.fai',
-                self.clusters_info_file,
-            ]
+        '''Delete intermediate files. clean=0 keeps everything; clean=2 also removes the Clusters directory and reference check files'''
+        if self.clean == 0:
+            if self.verbose:
+                print('   ... not deleting anything because --clean 0 used')
+            return
+        to_delete = [
+            self.bam,
+            self.cdhit_cluster_representatives_fa,
+            self.cdhit_cluster_representatives_fa + '.fai',
+            self.cdhit_files_prefix + '.non_coding.cdhit',
+            self.cdhit_files_prefix + '.presence_absence.cdhit',
+            self.cdhit_files_prefix + '.variants_only.cdhit',
         ]
 
-        for i in range(self.clean + 1):
-            for fname in to_clean[i]:
-                if os.path.exists(fname):
-                    if self.verbose:
-                        print('  rm', fname)
-                    os.unlink(fname)
-
-        if self.clean >= 2:
+        if self.clean == 2:
             if self.verbose:
-                print('  rm -r', self.clusters_outdir)
-                shutil.rmtree(self.clusters_outdir)
+                print('    rm -r', self.clusters_outdir)
+            shutil.rmtree(self.clusters_outdir)  # outside the verbose check: deletion must not depend on logging
 
+            to_delete.extend([
+                self.cdhit_files_prefix + '.clusters.tsv',
+                self.refdata_files_prefix + '.00.check_fasta_presence_absence.log',
+                self.refdata_files_prefix + '.00.check_fasta_variants_only.log',
+                self.refdata_files_prefix + '.01.check_variants.log',
+                self.refdata_files_prefix + '.01.check_variants.non_coding.fa',
+                self.refdata_files_prefix + '.01.check_variants.presence_absence.fa',
+                self.refdata_files_prefix + '.01.check_variants.tsv',
+                self.refdata_files_prefix + '.01.check_variants.variants_only.fa',
+            ])
+
+        for filename in to_delete:
+            if os.path.exists(filename):
+                if self.verbose:
+                    print('    rm', filename)
+                try:
+                    os.unlink(filename)
+                except OSError as err:  # only filesystem failures; let KeyboardInterrupt etc. propagate
+                    raise Error('Error deleting file', filename) from err
+
 
     def run(self):
         cwd = os.getcwd()
         os.chdir(self.outdir)
 
         if self.verbose:
+            print('{:_^79}'.format(' Checking reference data '), flush=True)
+        self.refdata.sanity_check(self.refdata_files_prefix)
+
+        if self.verbose:
+            print()
             print('{:_^79}'.format(' Running cd-hit '), flush=True)
         self._run_cdhit()
-        self._write_clusters_info_file()
+
         if self.verbose:
             print('Finished cd-hit\n')
             print('{:_^79}'.format(' Mapping reads to clustered genes '), flush=True)
         self._map_reads_to_clustered_genes()
+
         if self.verbose:
             print('Finished mapping\n')
             print('{:_^79}'.format(' Generating clusters '), flush=True)
         self._bam_to_clusters_reads()
+
         if len(self.cluster_to_dir) > 0:
             self._set_insert_size_data()
             if self.verbose:
-                print('{:_^79}'.format(' Assembling each cluster '), flush=True)
+                print('{:_^79}'.format(' Assembling each cluster '))
+                print('Will run', self.threads, 'cluster(s) in parallel', flush=True)
             self._init_and_run_clusters()
             if self.verbose:
                 print('Finished assembling clusters\n')
@@ -448,11 +407,19 @@ def run(self):
 
         if self.verbose:
             print('{:_^79}'.format(' Writing report files '), flush=True)
-        self._write_reports()
-        self._write_catted_assembled_genes_fasta()
+            print(self.report_file_tsv)
+            print(self.report_file_xls)
+        self._write_reports(self.clusters, self.report_file_tsv, self.report_file_xls)
+
+        if self.verbose:
+            print('{:_^79}'.format(' Writing fasta of assembled sequences '), flush=True)
+            print(self.catted_assembled_seqs_fasta)
+        self._write_catted_assembled_seqs_fasta(self.catted_assembled_seqs_fasta)
+
         if self.verbose:
-            print('Finished writing report files. Cleaning files', flush=True)
+            print('\n\nCleaning files:', flush=True)
         self._clean()
+
         if self.verbose:
             print('\nAll done!\n')
 
diff --git a/ariba/external_progs.py b/ariba/external_progs.py
index a1d7239c..317f2acc 100644
--- a/ariba/external_progs.py
+++ b/ariba/external_progs.py
@@ -4,14 +4,10 @@
 from distutils.version import LooseVersion
 import re
 import sys
-import pyfastaq
 from ariba import common
 
 class Error (Exception): pass
 
-def is_in_path(prog):
-    return shutil.which(prog) is not None
-
 
 prog_to_default = {
     'bcftools': 'bcftools',
@@ -20,19 +16,12 @@ def is_in_path(prog):
     'gapfiller': 'GapFiller.pl',
     'nucmer' : 'nucmer',
     'samtools': 'samtools',
-    'smalt': 'smalt',
     'spades': 'spades.py',
     'sspace': 'SSPACE_Basic_v2.0.pl',
-    'velvetg': 'velvetg',
-    'velveth': 'velveth',
 }
 
 
-prog_to_env_var = {
-    'bcftools': 'ARIBA_BCFTOOLS',
-    'samtools': 'ARIBA_SAMTOOLS',
-    'spades': 'ARIBA_SPADES', 
-}
+prog_to_env_var = {x: 'ARIBA_' + x.upper() for x in prog_to_default if x not in {'nucmer'}}
 
 
 prog_to_version_cmd = {
@@ -42,11 +31,8 @@ def is_in_path(prog):
     'gapfiller': ('', re.compile('^Usage: .*pl \[GapFiller_(.*)\]')),
     'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')),
     'samtools': ('', re.compile('^Version: ([0-9\.]+)')),
-    'smalt': ('version', re.compile('^Version: ([0-9\.]+)')),
     'spades': ('', re.compile('^SPAdes genome assembler v.([0-9\.]+)')),
     'sspace': ('', re.compile('^Usage: .*pl \[SSPACE_(.*)\]')),
-    'velvetg': ('', re.compile('Version ([0-9\.]+)')),
-    'velveth': ('', re.compile('Version ([0-9\.]+)')),
 }
 
 
@@ -56,110 +42,105 @@ def is_in_path(prog):
     'cd-hit': '4.6',
     'nucmer': '3.1',
     'samtools': '1.2',
-    'smalt': '0.7.4',
     'spades': '3.5.0',
-    'velvetg': '1.2.07',
-    'velveth': '1.2.07',
 }
 
 
-def set_path(prog, opts):
-    path_from_opts = eval('opts.' + prog)
-    if path_from_opts is not None:
-        return
-
-    if prog in prog_to_env_var:
-        env_var = prog_to_env_var[prog]
-        if env_var in os.environ:
-            exec('opts.' + prog + ' = "' + os.environ[env_var] + '"')
-            return
-
-    exec('opts.' + prog + ' = "' + prog_to_default[prog] + '"')
+class ExternalProgs:
+    def __init__(self, verbose=False):
+        '''Find each external program, store its resolved path in self.progs; raise Error if a required one is missing or its version check fails'''
+        optional_progs = {'sspace', 'gapfiller'}
+        self.progs = {}
 
+        if verbose:
+            print('{:_^79}'.format(' Checking dependencies and their versions '))
+            print('tool', 'version', 'path', sep='\t')
+
+        # problems are collected and reported together after all programs are checked
+        errors = []
+        warnings = []
+
+        for prog in sorted(prog_to_default):
+            prog_exe = self._get_exe(prog)
+            self.progs[prog] = shutil.which(prog_exe)
+            if self.progs[prog] is None:
+                if prog in optional_progs:
+                    warnings.append(prog + ' not found in path. Looked for ' + prog_exe + '. But it is optional so will be skipped during assembly')
+                else:
+                    errors.append(prog + ' not found in path. Looked for ' + prog_exe + '. Cannot continue')
+                if verbose:
+                    print(prog, 'NA', 'NOT_FOUND', sep='\t')
+                continue
+            elif prog in {'sspace', 'gapfiller'}:
+                self.progs[prog] = os.path.realpath(self.progs[prog])  # store the symlink-resolved location of these scripts
+
+            got_version, version = self._get_version(prog, self.progs[prog])
+
+            if got_version:
+                if prog in min_versions and LooseVersion(version) < LooseVersion(min_versions[prog]):
+                    errors.append(' '.join(['Found version', version, 'of', prog, 'which is too low! Please update to at least', min_versions[prog] + '. Found it here:', self.progs[prog]]))
+            else:
+                errors.append(version)  # on failure, _get_version returns an error message in place of the version
+                version = 'ERROR'
+
+            if verbose:
+                print(prog, version, self.progs[prog], sep='\t')
 
-def get_version(prog, path=None, raise_error=True):
-    assert prog in prog_to_version_cmd
-    if path is None:
-        path = prog
 
-    if not is_in_path(path):
-        if raise_error:
-            raise Error('Error getting version of ' + path + ' - not found in path.')
-        else:
-            return 'Not_in_path', 'Not_in_path'
-
-    path = shutil.which(path)
-
-    if prog in ['sspace', 'gapfiller']:
-        cmd = 'perl ' + os.path.realpath(shutil.which(path))
-        regex = prog_to_version_cmd[prog][1]
-    else:
-        cmd, regex = prog_to_version_cmd[prog]
-        cmd = path + ' ' + cmd
-
-    cmd_output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
-    cmd_output = common.decode(cmd_output[0]).split('\n')[:-1] + common.decode(cmd_output[1]).split('\n')[:-1]
-    for line in cmd_output:
-        hits = regex.search(line)
-        if hits:
-            return hits.group(1), path
-    return 'UNKNOWN ...\n I tried running this to get the version: "' + cmd + '"\n and the output didn\'t match this regular expression: "' + regex.pattern + '"', path
-
-
-def check_versions(opts, verbose=False, not_required=None):
-    if not_required is None:
-        not_required = set()
-
-    if verbose:
-        print('{:_^79}'.format(' Checking dependencies and their versions '))
-        print('tool', 'version', 'path', sep='\t')
-
-    to_check = [
-        'bcftools',
-        'bowtie2',
-        'cdhit',
-        'nucmer',
-        'samtools',
-        'sspace',
-        'gapfiller',
-    ]
-    
-    if opts.assembler == 'spades':
-        to_check.append('spades')
-    elif opts.assembler == 'velvet':
-        to_check.append('velvetg')
-        to_check.append('velveth')
-    else:
-        raise Error('Assembler ' + opts.assembler + ' not recognised. Cannot continue')
-
-    errors = []
-    failed_to_find = set()
-
-    for prog in to_check:
-        set_path(prog, opts)
-        version, path = get_version(prog, path=eval('opts.' + prog), raise_error=prog not in not_required)
         if verbose:
-            print(prog, version, path, sep='\t')
-        if path == 'Not_in_path':
-            print('\nWARNING:', prog, 'not found in path, so will be skipped during assembly\n', file=sys.stderr)
-
-        if prog in min_versions and LooseVersion(version) < LooseVersion(min_versions[prog]):
-            errors.append(' '.join(['Found version', version, 'of', prog, 'which is too low! Please update to at least', min_versions[prog] + '\n   Found it here:', path]))
-            failed_to_find.add(prog)
-
-    if len(errors):
-        for e in errors:
-            print('\n*** Error! Bad dependency! ***', file=sys.stderr)
-            print(e, file=sys.stderr)
             print()
-        if len(failed_to_find.difference(not_required)) > 0:
-            raise Error('Cannot continue. Some dependencies need updating')
+
+        for line in warnings:
+            print('WARNING:', line, file=sys.stderr)
+
+        if errors:
+            for line in errors:
+                print('ERROR:', line, file=sys.stderr)
+            print('\nSomething wrong with at least one dependency. Please see the above error message(s)', file=sys.stderr)
+            raise Error('Dependency error(s). Cannot continue')
+        elif verbose:
+            if warnings:
+                print('\nWARNING: Required dependencies found, but at least one optional one was not. Please see previous warning(s) for more details.', file=sys.stderr)
+            else:
+                print('\nDependencies look OK')
+
+
+    def exe(self, prog):
+        return self.progs[prog]
+
+
+    @staticmethod
+    def _get_exe(prog):
+        '''Given a program name, return what we expect its executable to be called'''
+        if prog in prog_to_env_var:
+            env_var = prog_to_env_var[prog]
+            if env_var in os.environ:
+                return os.environ[env_var]
+
+        return prog_to_default[prog]
+
+
+    @staticmethod
+    def _get_version(prog, path):
+        '''Given a program name and expected path, tries to determine its version.
+           Returns tuple (bool, version). First element True iff found version ok.
+           Second element is version string (if found), otherwise an error message'''
+        assert prog in prog_to_version_cmd
+
+        if prog in ['sspace', 'gapfiller']:
+            cmd = 'perl ' + os.path.realpath(shutil.which(path))
+            regex = prog_to_version_cmd[prog][1]
         else:
-            assert failed_to_find.issubset(not_required)
-            if 'sspace' in failed_to_find:
-                print('WARNING: SSPACE not found. Will not run scaffolding or gap filling', file=sys.stderr)
-            elif 'gapfiller' in failed_to_find:
-                print('WARNING: GapFiller not found. Will not run gap filling after scaffolding', file=sys.stderr)
-
-    if verbose:
-        print('\nDependencies look OK (but check in case there are warnings about SSPACE or GapFiller)\n')
+            cmd, regex = prog_to_version_cmd[prog]
+            cmd = path + ' ' + cmd
+
+        cmd_output = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
+        cmd_output = common.decode(cmd_output[0]).split('\n')[:-1] + common.decode(cmd_output[1]).split('\n')[:-1]
+
+        for line in cmd_output:
+            hits = regex.search(line)
+            if hits:
+                return True, hits.group(1)
+
+        return False, 'I tried to get the version of ' + prog + ' with: "' + cmd + '" and the output didn\'t match this regular expression: "' + regex.pattern + '"'
+
diff --git a/ariba/flag.py b/ariba/flag.py
index fa0d5cee..28e2c561 100644
--- a/ariba/flag.py
+++ b/ariba/flag.py
@@ -2,9 +2,9 @@ class Error (Exception): pass
 
 
 flags_in_order = [
-    'gene_assembled',
-    'gene_assembled_into_one_contig',
-    'gene_region_assembled_twice',
+    'assembled',
+    'assembled_into_one_contig',
+    'region_assembled_twice',
     'complete_orf',
     'unique_contig',
     'scaffold_graph_bad',
@@ -12,6 +12,7 @@ class Error (Exception): pass
     'variants_suggest_collapsed_repeat',
     'hit_both_strands',
     'has_nonsynonymous_variants',
+    'ref_seq_choose_fail',
 ]
 
 
diff --git a/ariba/mapping.py b/ariba/mapping.py
index e7e08ebb..46034375 100644
--- a/ariba/mapping.py
+++ b/ariba/mapping.py
@@ -18,7 +18,8 @@ def run_bowtie2(
       bowtie2='bowtie2',
       bowtie2_preset='very-sensitive-local',
       verbose=False,
-      verbose_filehandle=sys.stdout
+      verbose_filehandle=sys.stdout,
+      remove_both_unmapped=False,
     ):
 
     map_index = out_prefix + '.map_index'
@@ -36,101 +37,50 @@ def run_bowtie2(
     else:
         intermediate_bam = final_bam
 
-    map_cmd = ' '.join([
+    map_cmd = [
         bowtie2,
         '--threads', str(threads),
+        '--reorder',
         '--' + bowtie2_preset,
         '-X', str(max_insert),
         '-x', map_index,
         '-1', reads_fwd,
         '-2', reads_rev,
+    ]
+
+    if remove_both_unmapped:
+        map_cmd.append(r''' | awk ' !(and($2,4)) || !(and($2,8)) ' ''')
+
+
+    map_cmd.extend([
         '|', samtools, 'view',
         '-bS -T', ref_fa,
         '- >', intermediate_bam
     ])
 
+    map_cmd = ' '.join(map_cmd)
+
     common.syscall(index_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
     common.syscall(map_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
 
     if sort:
         threads = min(4, threads)
         thread_mem = int(500 / threads)
-        sort_cmd = samtools + ' sort -@' + str(threads) + ' -m ' + str(thread_mem) + 'M ' + intermediate_bam + ' ' + out_prefix
+        sort_cmd = ' '.join([
+            samtools,
+            'sort',
+            '-@' + str(threads),
+            '-m' + str(thread_mem) + 'M',
+            '-o', final_bam,
+            '-O bam',
+            '-T', out_prefix + '.tmp.samtool_sort',
+            intermediate_bam,
+        ])
         index_cmd = samtools + ' index ' + final_bam
         common.syscall(sort_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
         common.syscall(index_cmd, verbose=verbose, verbose_filehandle=verbose_filehandle)
-    for fname in clean_files:
-        os.unlink(fname)
-
+        clean_files.append(intermediate_bam)
 
-def run_smalt(
-      reads_fwd,
-      reads_rev,
-      ref_fa,
-      out_prefix,
-      index_k=9,
-      index_s=2,
-      threads=1,
-      max_insert=1000,
-      minid=0.9,
-      sort=False,
-      extra_smalt_map_ops='-x',
-      samtools='samtools',
-      smalt='smalt',
-      verbose=False
-    ):
-    if extra_smalt_map_ops is None:
-        extra_smalt_map_ops = ''
-    map_index = out_prefix + '.map_index'
-    clean_files = [map_index + '.' + x for x in ['smi', 'sma']]
-    index_cmd = ' '.join([
-        smalt, 'index',
-        '-k', str(index_k),
-        '-s', str(index_s),
-        map_index,
-        ref_fa
-    ])
-
-    map_cmd = smalt + ' map ' + extra_smalt_map_ops + ' '
-
-    # depending on OS, -n can break smalt, so only use -n if it's > 1.
-    if threads > 1:
-        map_cmd += '-n ' + str(threads) + ' -O '
-
-    if reads_rev is None:
-        map_cmd += ' '.join([
-            '-y', str(minid),
-            map_index,
-            reads_fwd,
-        ])
-    else:
-        map_cmd += ' '.join([
-            '-i', str(max_insert),
-            '-y', str(minid),
-            map_index,
-            reads_fwd,
-            reads_rev,
-        ])
-
-    map_cmd += ' | ' + samtools + ' view'
-
-    final_bam = out_prefix + '.bam'
-    if sort:
-        intermediate_bam = out_prefix + '.unsorted.bam'
-    else:
-        intermediate_bam = final_bam
-
-    map_cmd += ' -bS -T ' + ref_fa + '  - > ' + intermediate_bam
-    common.syscall(index_cmd, verbose=verbose)
-    common.syscall(map_cmd, verbose=verbose)
-
-    if sort:
-        threads = min(4, threads)
-        thread_mem = int(500 / threads)
-        sort_cmd = samtools + ' sort -@' + str(threads) + ' -m ' + str(thread_mem) + 'M ' + intermediate_bam + ' ' + out_prefix
-        index_cmd = samtools + ' index ' + final_bam
-        common.syscall(sort_cmd, verbose=verbose)
-        common.syscall(index_cmd, verbose=verbose)
     for fname in clean_files:
         os.unlink(fname)
 
diff --git a/ariba/ref_genes_getter.py b/ariba/ref_genes_getter.py
new file mode 100644
index 00000000..f8057645
--- /dev/null
+++ b/ariba/ref_genes_getter.py
@@ -0,0 +1,333 @@
+class Error (Exception): pass
+
+import sys
+import os
+import shutil
+import re
+import requests
+import pyfastaq
+import urllib
+import time
+from bs4 import BeautifulSoup
+from ariba import common
+
+
+class RefGenesGetter:
+    def __init__(self, ref_db, genetic_code=11):
+        allowed_ref_dbs = {'card', 'argannot', 'resfinder'}
+        if ref_db not in allowed_ref_dbs:
+            raise Error('Error in RefGenesGetter. ref_db must be one of: ' + str(allowed_ref_dbs) + ', but I got "' + ref_db)
+        self.ref_db=ref_db
+        self.genetic_code = genetic_code
+        self.max_download_attempts = 3
+        self.sleep_time = 2
+        pyfastaq.sequences.genetic_code = self.genetic_code
+
+
+    def _download_file(self, url, outfile):
+        print('Downloading "', url, '" and saving as "', outfile, '" ...', end='', sep='')
+        for i in range(self.max_download_attempts):
+            time.sleep(self.sleep_time)
+            try:
+                urllib.request.urlretrieve(url, filename=outfile)
+            except:
+                continue
+            break
+        else:
+            raise Error('Error downloading: ' + url)
+        print(' done', flush=True)
+
+
+    def _get_souped_request(self, url):
+        print('Getting url "', url, '" ...', sep='', end='')
+        for i in range(self.max_download_attempts):
+            time.sleep(self.sleep_time)
+            r = requests.get(url)
+            if r.status_code == 200:
+                break
+        else:
+            raise Error('\nError requests.get with url: ' + url)
+
+        print('done', flush=True)
+        return BeautifulSoup(r.text, 'html.parser')
+
+
+    def _get_card_gene_variant_info(self, gene, index_url):
+        print('Getting variant info on CARD gene', gene, flush=True)
+        soup = self._get_souped_request(index_url)
+
+        # get link to Antibiotic Resistance page
+        rows = soup.find_all('tr')
+        gene_indexes = [i for i, j in enumerate(rows) if 'Antibiotic Resistance' in j.text]
+        if len(gene_indexes) != 1:
+            raise Error('Error getting one link to antibiotic resistance. Found ' + str(len(gene_indexes)) + ' links')
+
+        row_index = gene_indexes[0] + 1
+        assert row_index < len(rows)
+        antibio_links = rows[row_index].find_all('a')
+        print('Found', len(antibio_links), 'links to antibiotic resistance pages')
+
+        variants = []
+
+        for antibio_link_obj in antibio_links:
+            antibio_link = antibio_link_obj['href']
+            soup = self._get_souped_request(antibio_link)
+
+            # get description
+            ontology_def = soup.find(id='ontology-definition-field')
+            if ontology_def is None:
+                description = None
+            else:
+                # there are sometimes newline characters in the description.
+                # replace all whitespace characters with a space
+                description = re.sub('\s', ' ', ontology_def.text)
+
+            # get variants
+            bioinf_tables = [x for x in soup.find_all('table') if 'Bioinformatics' in x.text]
+
+            if len(bioinf_tables) != 1:
+                raise Error('Error getting Bioinformatics table from ' + antibio_link)
+
+            bioinf_table = bioinf_tables[0]
+            variant_elements = [x for x in bioinf_table.find_all('small') if 'Resistance Variant' in x.text]
+
+            if len(variant_elements) < 1:
+                print('WARNING:', gene, 'No variants found on page', antibio_link)
+            else:
+                new_variants = [x.text.split()[-1].split('<')[0] for x in variant_elements]
+                for variant in new_variants:
+                    print('New variant:', variant, description)
+                    variants.append((variant, description))
+
+
+        if len(variants):
+            print('Total of', len(variants), 'variants found for gene', gene)
+        else:
+            print('WARNING:', gene, 'No valid variants found for gene')
+
+        return variants
+
+
+    def _get_card_variant_data(self, tsv_outfile, got_genes_set, got_genes_file):
+        if got_genes_set is None:
+            got_genes_set = set()
+            try:
+                tsv_out_fh = open(tsv_outfile, 'w')
+            except:
+                raise Error('Error opening file for writing: "' + tsv_outfile + '"')
+        else:
+            try:
+                tsv_out_fh = open(tsv_outfile, 'a')
+            except:
+                raise Error('Error opening file for appending: "' + tsv_outfile + '"')
+
+        got_genes_fh = pyfastaq.utils.open_file_write(got_genes_file)
+        soup = self._get_souped_request('http://arpcard.mcmaster.ca/?q=CARD/search/mqt.35950.mqt.806')
+        table = soup.find(id='searchresultsTable')
+        links = {x.text : x['href'] for x in table.find_all('a')}
+        print('Found', len(links), 'genes to get variants for')
+        genes_done = 0
+
+        for gene, url in sorted(links.items()):
+            if gene in got_genes_set:
+                print('Info for gene', gene, 'already found. Skipping')
+            else:
+                print('\nGetting info for gene', gene, 'from', url)
+                variants = self._get_card_gene_variant_info(gene, links[gene])
+                if len(variants) == 0:
+                    print('WARNING: No valid variants found for gene', gene)
+                for variant, description in variants:
+                    print(gene, 'p', variant, 'N', description, sep='\t', file=tsv_out_fh, flush=True)
+
+            print(gene, file=got_genes_fh, flush=True)
+            genes_done += 1
+            print('Done', genes_done, 'genes of', len(links))
+
+        tsv_out_fh.close()
+        pyfastaq.utils.close(got_genes_fh)
+
+
+    @staticmethod
+    def _card_parse_presence_absence(infile, fa_outfile, metadata_fh):
+        presence_absence_ids = set()
+
+        file_reader = pyfastaq.sequences.file_reader(infile)
+        fa_out = pyfastaq.utils.open_file_write(fa_outfile)
+
+        for seq in file_reader:
+            try:
+                seq.id, description = seq.id.split(maxsplit=1)
+            except:
+                description = None
+
+            presence_absence_ids.add(seq.id)
+            print(seq, file=fa_out)
+
+            if description is not None:
+                print(seq.id, '.', '.', 'N', description, sep='\t', file=metadata_fh)
+
+        pyfastaq.utils.close(fa_out)
+        return presence_absence_ids
+
+
+    @staticmethod
+    def _card_parse_all_genes(infile, outfile, metadata_fh, presence_absence_ids):
+        file_reader = pyfastaq.sequences.file_reader(infile)
+        f_out = pyfastaq.utils.open_file_write(outfile)
+
+        for seq in file_reader:
+            try:
+                seq.id, description = seq.id.split(maxsplit=1)
+            except:
+                description = None
+
+            if seq.id in presence_absence_ids:
+                continue
+
+            print(seq, file=f_out)
+            if description is not None:
+                print(seq.id, '.', '.', 'N', description, sep='\t', file=metadata_fh)
+
+        pyfastaq.utils.close(f_out)
+
+
+    def _get_from_card(self, outprefix):
+        variant_metadata_tsv = outprefix + '.variant_metadata.tsv'
+        got_genes_file = outprefix + '.gene_variants.progress'
+        genes_done_file = outprefix + '.gene_variants.done'
+
+        if os.path.exists(genes_done_file):
+            if not os.path.exists(variant_metadata_tsv):
+                raise Error('Error from previous run. Found file ' + genes_done_file + ' but not ' + variant_metadata_tsv + '. Cannot continue. Delete all previous files and start again')
+            print('Found files', genes_done_file, 'and', variant_metadata_tsv, 'from previous run, so no need to get genes variants info.')
+        else:
+            if os.path.exists(got_genes_file) and os.path.exists(variant_metadata_tsv):
+                print('Existing files found. Try to continue getting gene variants')
+                with open(got_genes_file) as f:
+                    got_genes_set = {x.rstrip() for x in f}
+            else:
+                print('Found none or one (but not both) of', got_genes_file, 'and', variant_metadata_tsv, 'so starting downloading from scratch.')
+                for filename in (got_genes_file, variant_metadata_tsv):
+                    try:
+                        os.unlink(filename)
+                    except:
+                        pass
+                got_genes_set = None
+
+            self._get_card_variant_data(variant_metadata_tsv, got_genes_set, got_genes_file)
+            with open(genes_done_file, 'w') as f:
+                pass
+
+        print('Finished getting variant data. Getting FASTA files', flush=True)
+        all_ref_genes_fa_gz = outprefix + '.tmp.downloaded.all_genes.fa.gz'
+        presence_absence_fa_gz = outprefix + '.tmp.download.presence_absence.fa.gz'
+        variants_only_fa = outprefix + '.variants_only.fa'
+        presence_absence_fa = outprefix + '.presence_absence.fa'
+        self._download_file('http://arpcard.mcmaster.ca/blast/db/nucleotide/AR-genes.fa.gz', all_ref_genes_fa_gz)
+        self._download_file('http://arpcard.mcmaster.ca/blast/db/nucleotide/ARmeta-genes.fa.gz', presence_absence_fa_gz)
+
+        print('Making presence_absence and variants_only fasta files, and getting their metadata', flush=True)
+
+        general_metadata_tsv = outprefix + '.general_metadata.tsv'
+        general_metadata_fh = pyfastaq.utils.open_file_write(general_metadata_tsv)
+        presence_absence_ids = self._card_parse_presence_absence(presence_absence_fa_gz, presence_absence_fa, general_metadata_fh)
+        self._card_parse_all_genes(all_ref_genes_fa_gz, variants_only_fa, general_metadata_fh, presence_absence_ids)
+        pyfastaq.utils.close(general_metadata_fh)
+
+        print('Deleting temporary downloaded files', all_ref_genes_fa_gz, presence_absence_fa_gz)
+        os.unlink(all_ref_genes_fa_gz)
+        os.unlink(presence_absence_fa_gz)
+
+        print('Catting', variant_metadata_tsv, 'and', general_metadata_tsv)
+        final_tsv = outprefix + '.metadata.tsv'
+        with open(final_tsv, 'w') as f_out:
+            for filename in [variant_metadata_tsv, general_metadata_tsv]:
+                print('   ', filename)
+                with open(filename) as f_in:
+                    for line in f_in:
+                        print(line, end='', file=f_out)
+
+        print('Finished making files. Final genes files and metadata file:')
+        print('   ', presence_absence_fa)
+        print('   ', variants_only_fa)
+        print('   ', final_tsv)
+
+        print('\nYou can use them with ARIBA like this:')
+        print('ariba run --presabs', presence_absence_fa, '--varonly', variants_only_fa, '--metadata', final_tsv, ' reads_1.fq reads_2.fq output_directory\n')
+
+        print('If you use this downloaded data, please cite:')
+        print('"The Comprehensive Antibiotic Resistance Database", McArthur et al 2013, PMID: 23650175')
+
+
+    def _get_from_resfinder(self, outprefix):
+        outprefix = os.path.abspath(outprefix)
+        final_fasta = outprefix + '.genes.fa'
+        tmpdir = outprefix + '.tmp.download'
+        current_dir = os.getcwd()
+
+        try:
+            os.mkdir(tmpdir)
+            os.chdir(tmpdir)
+        except:
+            raise Error('Error mkdir/chdir ' + tmpdir)
+
+        zipfile = 'resfinder.zip'
+        cmd = 'curl -X POST --data "folder=resfinder&filename=resfinder.zip" -o ' + zipfile + ' https://cge.cbs.dtu.dk/cge/download_data.php'
+        print('Downloading data with:', cmd, sep='\n')
+        common.syscall(cmd)
+        common.syscall('unzip ' + zipfile)
+
+        print('Combining downloaded fasta files...')
+        f = pyfastaq.utils.open_file_write(final_fasta)
+
+        for filename in os.listdir('database'):
+            if filename.endswith('.fsa'):
+                print('   ', filename)
+                file_reader = pyfastaq.sequences.file_reader(os.path.join('database', filename))
+                for seq in file_reader:
+                    print(seq, file=f)
+
+        pyfastaq.utils.close(f)
+
+        print('\nCombined files. Final genes file is callled', final_fasta, end='\n\n')
+        os.chdir(current_dir)
+        shutil.rmtree(tmpdir)
+
+        print('You can use it with ARIBA like this:')
+        print('ariba run --presabs', os.path.relpath(final_fasta), 'reads_1.fq reads_2.fq output_directory\n')
+        print('If you use this downloaded data, please cite:')
+        print('"Identification of acquired antimicrobial resistance genes", Zankari et al 2012, PMID: 22782487\n')
+
+
+    def _get_from_argannot(self, outprefix):
+        outprefix = os.path.abspath(outprefix)
+        tmpdir = outprefix + '.tmp.download'
+        current_dir = os.getcwd()
+
+        try:
+            os.mkdir(tmpdir)
+            os.chdir(tmpdir)
+        except:
+            raise Error('Error mkdir/chdir ' + tmpdir)
+
+        zipfile = 'arg-annot-database_doc.zip'
+        self._download_file('http://www.mediterranee-infection.com/arkotheque/client/ihumed/_depot_arko/articles/304/arg-annot-database_doc.zip', zipfile)
+        common.syscall('unzip ' + zipfile)
+        os.chdir(current_dir)
+        print('Extracted files.')
+
+        genes_file = os.path.join(tmpdir, 'Database Nt Sequences File.txt')
+        final_fasta = outprefix + '.fa'
+        pyfastaq.tasks.to_fasta(genes_file, final_fasta)
+        shutil.rmtree(tmpdir)
+
+        print('Finished. Final genes file is called', final_fasta, end='\n\n')
+        print('You can use it with ARIBA like this:')
+        print('ariba run --presabs', os.path.relpath(final_fasta), 'reads_1.fq reads_2.fq output_directory\n')
+        print('If you use this downloaded data, please cite:')
+        print('"ARG-ANNOT, a new bioinformatic tool to discover antibiotic resistance genes in bacterial genomes",\nGupta et al 2014, PMID: 24145532\n')
+
+
+    def run(self, outprefix):
+        exec('self._get_from_' + self.ref_db + '(outprefix)')
+
diff --git a/ariba/refcheck.py b/ariba/refcheck.py
deleted file mode 100644
index 551b648f..00000000
--- a/ariba/refcheck.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import pyfastaq
-
-class Error (Exception): pass
-
-
-class Checker:
-    def __init__(self, infile, min_length=1, max_length=10000, outprefix=None):
-        self.infile = os.path.abspath(infile)
-        if not os.path.exists(self.infile):
-            raise Error('File not found: "' + self.infile + '". Cannot continue')
-
-        self.min_length = min_length
-        self.max_length = max_length
-        self.outprefix = outprefix
-
-
-    def run(self):
-        file_reader = pyfastaq.sequences.file_reader(self.infile)
-        names = {}
-
-        if self.outprefix is not None:
-            old2new_out = self.outprefix + '.rename'
-            fasta_out = self.outprefix + '.fa'
-            bad_seqs_out = self.outprefix + '.removed.fa'
-            log_out = self.outprefix + '.log'
-            old2new_out_fh = pyfastaq.utils.open_file_write(old2new_out)
-            fasta_out_fh = pyfastaq.utils.open_file_write(fasta_out)
-            bad_seqs_out_fh = pyfastaq.utils.open_file_write(bad_seqs_out)
-            log_out_fh = pyfastaq.utils.open_file_write(log_out)
-
-        for seq in file_reader:
-            seq.seq = seq.seq.upper()
-            if len(seq) < self.min_length:
-                if self.outprefix is None:
-                    return False, 'Too short', seq
-                else:
-                    print(seq.id, 'Too short. Skipping', sep='\t', file=log_out_fh)
-                    print(seq, file=bad_seqs_out_fh)
-                    continue
-            elif len(seq) > self.max_length:
-                if self.outprefix is None:
-                    return False, 'Too long', seq
-                else:
-                    print(seq.id, 'Too long. Skipping', sep='\t', file=log_out_fh)
-                    print(seq, file=bad_seqs_out_fh)
-                    continue
-
-            if not seq.looks_like_gene():
-                if self.outprefix is None:
-                    return False, 'Not a gene', seq
-                else:
-                    seq.revcomp()
-                    if seq.looks_like_gene():
-                        print(seq.id, 'Reverse complemented', sep='\t', file=log_out_fh)
-                    else:
-                        print(seq.id, 'Does not look like a gene. Skipping', sep='\t', file=log_out_fh)
-                        seq.revcomp()
-                        print(seq, file=bad_seqs_out_fh)
-                        continue
-
-            original_id = seq.id
-            # replace unwanted characters with underscores
-            to_replace = ' '
-            seq.id = seq.id.translate(str.maketrans(to_replace, '_' *  len(to_replace)))
-
-            if self.outprefix is None and original_id != seq.id:
-                seq.id = original_id
-                return False, 'Name has spaces', seq
-
-            if seq.id in names:
-                if self.outprefix is None:
-                    return False, 'Duplicate name', seq
-                else:
-                    names[seq.id] += 1
-                    seq.id += '.' + str(names[seq.id])
-            else:
-                names[seq.id] = 1
-
-            if self.outprefix is not None:
-                print(original_id, seq.id, sep='\t', file=old2new_out_fh)
-                print(seq, file=fasta_out_fh)
-
-        if self.outprefix is not None:
-            pyfastaq.utils.close(fasta_out_fh)
-            pyfastaq.utils.close(bad_seqs_out_fh)
-            pyfastaq.utils.close(log_out_fh)
-            pyfastaq.utils.close(old2new_out_fh)
-
-        return True, None, None
diff --git a/ariba/reference_data.py b/ariba/reference_data.py
new file mode 100644
index 00000000..762b0c1d
--- /dev/null
+++ b/ariba/reference_data.py
@@ -0,0 +1,407 @@
+import os
+import sys
+import copy
+import pyfastaq
+from ariba import sequence_metadata, cdhit
+
+
+class Error (Exception): pass
+
+
+class ReferenceData:
+    def __init__(self,
+        presence_absence_fa=None,
+        variants_only_fa=None,
+        non_coding_fa=None,
+        metadata_tsv=None,
+        min_gene_length=6,
+        max_gene_length=10000,
+        genetic_code=11,
+    ):
+        self.seq_filenames = {}
+        self.seq_dicts = {}
+        self.min_gene_length = min_gene_length
+        self.max_gene_length = max_gene_length
+
+        total_ref_seqs_loaded = 0
+
+        for x in ['presence_absence', 'variants_only', 'non_coding']:
+            exec('self.seq_filenames[x] = self._get_filename(' + x + '_fa)')
+            self.seq_dicts[x] = self._load_fasta_file(self.seq_filenames[x])
+            total_ref_seqs_loaded += len(self.seq_dicts[x])
+
+        if {None} == set(self.seq_filenames.values()):
+            raise Error('Error! Must supply at least one of presence_absence_fa, variants_only_fa, non_coding_fa. Cannot continue')
+
+        if total_ref_seqs_loaded == 0:
+            raise Error('Error! No sequences found in input file(s). Maybe they were empty? Cannot continue.')
+
+        self.metadata = self._load_metadata_tsv(metadata_tsv)
+        self.genetic_code = genetic_code
+        pyfastaq.sequences.genetic_code = self.genetic_code
+        common_names = self._dict_keys_intersection(list(self.seq_dicts.values()))
+        if len(common_names):
+            raise Error('Error! Non-unique names found in input fasta files:\n' + '\n'.join(common_names))
+
+
+    @staticmethod
+    def _dict_keys_intersection(dicts):
+        dicts = [x for x in dicts if x is not None]
+        if len(dicts) == 0:
+            return set()
+
+        inter = set(dicts[0].keys())
+
+        for d in dicts[1:]:
+            inter = inter.intersection(set(d.keys()))
+        return inter
+
+
+    @staticmethod
+    def _get_filename(filename):
+        if filename is None:
+            return None
+        else:
+            if os.path.exists(filename):
+                return os.path.abspath(filename)
+            else:
+                raise Error('Error! File not found: ' + filename)
+
+
+    @staticmethod
+    def _load_metadata_tsv(filename):
+        if filename is None:
+            return {}
+
+        f = pyfastaq.utils.open_file_read(filename)
+        metadata_dict = {}
+
+        for line in f:
+            try:
+                metadata = sequence_metadata.SequenceMetadata(line)
+            except:
+                print('Problem with this line of metadata, which will be ignored:', line.rstrip(), file=sys.stderr)
+                continue
+
+            if metadata.name not in metadata_dict:
+                metadata_dict[metadata.name] = {'n': {}, 'p': {}, '.': set()}
+
+            if metadata.variant_type == '.':
+                metadata_dict[metadata.name]['.'].add(metadata)
+            else:
+                if metadata.variant.position not in metadata_dict[metadata.name][metadata.variant_type]:
+                    metadata_dict[metadata.name][metadata.variant_type][metadata.variant.position] = set()
+
+                metadata_dict[metadata.name][metadata.variant_type][metadata.variant.position].add(metadata)
+
+        pyfastaq.utils.close(f)
+        return metadata_dict
+
+
+    @staticmethod
+    def _load_fasta_file(filename):
+        d = {}
+
+        if filename is not None:
+            seq_reader = pyfastaq.sequences.file_reader(filename)
+            for seq in seq_reader:
+                seq.id = seq.id.split()[0]
+                if seq.id in d:
+                    raise Error('Duplicate name "' + seq.id + '" found in file ' + filename + '. Cannot continue)')
+                d[seq.id] = copy.copy(seq)
+
+        return d
+
+
+    @staticmethod
+    def _find_gene_in_seqs(name, dicts):
+        for dict_name, this_dict in dicts.items():
+            if this_dict is None:
+                continue
+            elif name in this_dict:
+                return dict_name
+
+        return None
+
+
+    @staticmethod
+    def _write_metadata_tsv(metadata, filename):
+        f = pyfastaq.utils.open_file_write(filename)
+
+        for gene_name, data_dict in sorted(metadata.items()):
+            for meta in data_dict['.']:
+                print(meta, file=f)
+
+            variants = []
+
+            for variant_type in ['n', 'p']:
+                for position in data_dict[variant_type]:
+                    for meta in data_dict[variant_type][position]:
+                        variants.append(meta)
+
+            variants.sort()
+            for v in variants:
+                print(v, file=f)
+
+        pyfastaq.utils.close(f)
+
+
+    @staticmethod
+    def _write_dict_of_sequences(seq_dict, filename):
+        f = pyfastaq.utils.open_file_write(filename)
+        for seq in sorted(seq_dict):
+            print(seq_dict[seq], file=f)
+        pyfastaq.utils.close(f)
+
+
+    def _write_sequences(self, filename, sequences_to_write):
+        assert sequences_to_write in self.seq_dicts and sequences_to_write in self.seq_filenames
+        if self.seq_filenames[sequences_to_write] is not None:
+            self._write_dict_of_sequences(self.seq_dicts[sequences_to_write], filename)
+
+
+    def _filter_bad_variant_data(self, out_prefix, presence_absence_removed, variants_only_removed):
+        genes_to_remove = set()
+        variants_only_genes_not_found = set(self.seq_dicts['variants_only'].keys())
+        log_file = out_prefix + '.log'
+        tsv_file = out_prefix + '.tsv'
+        new_variants_fa_file = out_prefix + '.variants_only.fa'
+        log_fh = pyfastaq.utils.open_file_write(log_file)
+
+        for gene_name, metadata_dict in sorted(self.metadata.items()):
+            if gene_name in presence_absence_removed:
+                print(gene_name, 'was removed from presence/absence fasta, so removing its metadata', file=log_fh)
+                genes_to_remove.add(gene_name)
+                continue
+            elif gene_name in variants_only_removed:
+                print(gene_name, 'was removed from variants only fasta, so removing its metadata', file=log_fh)
+                genes_to_remove.add(gene_name)
+                continue
+
+            gene_in_seq_dict = self._find_gene_in_seqs(gene_name, self.seq_dicts)
+            if gene_in_seq_dict is None:
+                print(gene_name, 'is in input tsv file, but not found in any input sequence files. Removing', file=log_fh)
+                genes_to_remove.add(gene_name)
+                continue
+
+            # take out any metadata that is not a variant and has no extra info.
+            to_remove = []
+
+            for metadata in metadata_dict['.']:
+                if metadata.free_text is None:
+                    print(gene_name, 'metadata has no info. Just gene name given. Removing. Line of file was:', metadata, file=log_fh)
+                    to_remove.append(metadata)
+
+            for metadata in to_remove:
+                metadata_dict['.'].remove(metadata)
+
+
+            # if this is non_coding, we shouldn't have any amino acid variants
+            if gene_in_seq_dict == 'non_coding':
+                for position in metadata_dict['p']:
+                    for metadata in metadata_dict['p'][position]:
+                        print(gene_name, 'variant of type "p" for protein, but sequence is non-coding. Removing. Line of file was:', metadata, file=log_fh)
+
+                metadata_dict['p'] = {}
+
+
+            # take out variant metadata that doesn't make sense (eg bases not matching ref sequence)
+            for variant_type in ['n', 'p']:
+                positions_to_remove = []
+                for position in metadata_dict[variant_type]:
+                    meta_to_remove = []
+                    for metadata in metadata_dict[variant_type][position]:
+                        to_translate = variant_type == 'p'
+
+                        if not metadata.variant.sanity_check_against_seq(self.seq_dicts[gene_in_seq_dict][gene_name], translate_seq=to_translate):
+                            print(gene_name, 'variant does not match reference. Removing. Line of file was:', metadata, file=log_fh)
+                            meta_to_remove.append(metadata)
+                            continue
+
+                        if gene_in_seq_dict == 'variants_only':
+                            variants_only_genes_not_found.discard(gene_name)
+
+                    for metadata in meta_to_remove:
+                        metadata_dict[variant_type][position].remove(metadata)
+                    if len(metadata_dict[variant_type][position]) == 0:
+                        positions_to_remove.append(position)
+
+                for position in positions_to_remove:
+                    del metadata_dict[variant_type][position]
+
+
+            if gene_in_seq_dict == 'variants_only' and len(metadata_dict['n']) == len(metadata_dict['p']) == len(metadata_dict['.']) == 0:
+                print(gene_name, 'No remaining data after checks. Removing this sequence because it is in the variants only file', file=log_fh)
+                genes_to_remove.add(gene_name)
+
+        for gene_name in genes_to_remove:
+            self.metadata.pop(gene_name)
+
+        for gene_name in variants_only_genes_not_found:
+            print(gene_name, 'is in variants only gene file, but no variants found. Removing.', file=log_fh)
+            self.seq_dicts['variants_only'].pop(gene_name)
+
+        pyfastaq.utils.close(log_fh)
+        self._write_metadata_tsv(self.metadata, tsv_file)
+        self._write_sequences(out_prefix + '.presence_absence.fa', 'presence_absence')
+        self._write_sequences(out_prefix + '.non_coding.fa', 'non_coding')
+        self._write_sequences(out_prefix + '.variants_only.fa', 'variants_only')
+
+
+    @staticmethod
+    def _gene_seq_is_ok(seq, min_length, max_length):
+        '''Upper-cases seq.seq in place, then returns tuple (True, None) if seq passes the checks, or (False, reason) if not'''
+        seq.seq = seq.seq.upper()
+        seq_length = len(seq)
+        if seq_length < min_length:
+            return False, 'Remove: too short. Length: ' + str(seq_length)
+        if seq_length > max_length:
+            return False, 'Remove: too long. Length: ' + str(seq_length)
+        if seq.looks_like_gene():
+            return True, None
+        return False, 'Does not look like a gene (does not start with start codon, length (' + str(seq_length) + ') is not a multiple of 3 (length/3=' + str(round(seq_length / 3, 2)) + '), or contains internal stop codons). Translation: ' + seq.translate().seq
+
+
+    def _remove_bad_genes(self, seqs_dict, log_file):
+        '''Drops every sequence from seqs_dict that fails _gene_seq_is_ok, writing
+           each message to log_file. Returns the set of names removed. If seqs_dict
+           is empty, no log file is written and an empty set is returned'''
+        bad_names = set()
+        if not seqs_dict:
+            return bad_names
+
+        log_fh = pyfastaq.utils.open_file_write(log_file)
+
+        for seq_name in sorted(seqs_dict):
+            passed, reason = self._gene_seq_is_ok(seqs_dict[seq_name], self.min_gene_length, self.max_gene_length)
+            if reason is not None:
+                print(seq_name, reason, file=log_fh)
+            if not passed:
+                bad_names.add(seq_name)
+
+        pyfastaq.utils.close(log_fh)
+        for seq_name in bad_names:
+            del seqs_dict[seq_name]
+        return bad_names
+
+
+    def sanity_check(self, outprefix):
+        '''Runs the gene checks on the variants_only and presence_absence sequences, then filters the variant metadata'''
+        removed = [self._remove_bad_genes(self.seq_dicts[key], outprefix + suffix) for key, suffix in (('variants_only', '.00.check_fasta_variants_only.log'), ('presence_absence', '.00.check_fasta_presence_absence.log'))]
+        self._filter_bad_variant_data(outprefix + '.01.check_variants', *removed)
+
+
+    def make_catted_fasta(self, outfile):
+        '''Writes every sequence from the presence_absence, variants_only and
+           non_coding input files (in that order) to outfile. Files set to None are skipped'''
+        fasta_out = pyfastaq.utils.open_file_write(outfile)
+        for seqs_type in ('presence_absence', 'variants_only', 'non_coding'):
+            if self.seq_filenames[seqs_type] is None:
+                continue
+            for sequence in pyfastaq.sequences.file_reader(self.seq_filenames[seqs_type]):
+                print(sequence, file=fasta_out)
+
+        pyfastaq.utils.close(fasta_out)
+
+
+    def sequence_type(self, sequence_name):
+        return self._find_gene_in_seqs(sequence_name, self.seq_dicts)  # NOTE(review): sequence() uses this result as a key of self.seq_dicts, and treats None as "not found"
+
+
+    def sequence(self, sequence_name):
+        '''Returns the sequence called sequence_name, or None if
+           _find_gene_in_seqs does not find it in self.seq_dicts'''
+        seqs_type = self._find_gene_in_seqs(sequence_name, self.seq_dicts)
+        # seqs_type is the key of the dictionary that holds the sequence
+        return None if seqs_type is None else self.seq_dicts[seqs_type][sequence_name]
+
+
+    def sequence_length(self, sequence_name):
+        found = self.sequence(sequence_name)  # None when the name is not known
+        assert found is not None
+        return len(found)
+
+
+    def all_non_wild_type_variants(self, ref_name):
+        '''Returns {'n': {...}, 'p': {...}}, each mapping position -> set of metadata
+           of ref_name that is flagged always_report, or whose variant the reference
+           sequence itself has. Both are empty if ref_name has no sequence or metadata'''
+        found = {'n': {}, 'p': {}}
+        reference = self.sequence(ref_name)
+        if reference is None or ref_name not in self.metadata:
+            return found
+
+        for seq_type in ('n', 'p'):
+            for pos, candidates in self.metadata[ref_name][seq_type].items():
+                to_keep = {m for m in candidates if m.always_report or m.has_variant(reference)}
+                # only positions that have something to report get an entry
+                if to_keep:
+                    found[seq_type].setdefault(pos, set()).update(to_keep)
+
+        return found
+
+
+    @staticmethod
+    def write_cluster_allocation_file(clusters, outfile):
+        '''Writes one tab-separated line per cluster to outfile: the name that the
+           cluster is keyed by, followed by its other members in sorted order.
+           Sequence types whose entry in clusters is None are skipped'''
+        f_out = pyfastaq.utils.open_file_write(outfile)
+
+        for seq_type in ('presence_absence', 'variants_only', 'non_coding'):
+            clusters_of_type = clusters[seq_type]
+            if clusters_of_type is not None:
+                for cluster_name in sorted(clusters_of_type):
+                    # the keyed name always comes first on the line,
+                    # and is not repeated among the other members
+                    members = sorted(clusters_of_type[cluster_name] - {cluster_name})
+                    print('\t'.join([cluster_name] + members), file=f_out)
+
+        pyfastaq.utils.close(f_out)
+
+
+    def cluster_with_cdhit(self, inprefix, outprefix, seq_identity_threshold=0.9, threads=1, length_diff_cutoff=0.9, nocluster=False, verbose=False):
+        '''Runs cdhit.Runner on inprefix.<type>.fa for each sequence type that has
+           sequences, writing outprefix.<type>.cdhit. With nocluster=True the
+           runner's fake_run() is called instead of run(). The per-type outputs are
+           concatenated into outprefix.cluster_representatives.fa and the cluster
+           allocation is written to outprefix.clusters.tsv. Returns dictionary of
+           sequence type -> clusters (None for a type that has no sequences)'''
+        clusters = {}
+        cdhit_outfiles = []
+
+        for seqs_type in ('presence_absence', 'variants_only', 'non_coding'):
+            if len(self.seq_dicts[seqs_type]) == 0:
+                clusters[seqs_type] = None
+                continue
+
+            cdhit_outfile = outprefix + '.' + seqs_type + '.cdhit'
+            cdhit_outfiles.append(cdhit_outfile)
+            runner = cdhit.Runner(
+              inprefix + '.' + seqs_type + '.fa',
+              cdhit_outfile,
+              seq_identity_threshold=seq_identity_threshold,
+              threads=threads,
+              length_diff_cutoff=length_diff_cutoff,
+              verbose=verbose
+            )
+            clusters[seqs_type] = runner.fake_run() if nocluster else runner.run()
+
+        assert len(cdhit_outfiles) > 0
+        f_out = pyfastaq.utils.open_file_write(outprefix + '.cluster_representatives.fa')
+        for cdhit_outfile in cdhit_outfiles:
+            for seq in pyfastaq.sequences.file_reader(cdhit_outfile):
+                print(seq, file=f_out)
+
+        pyfastaq.utils.close(f_out)
+        self.write_cluster_allocation_file(clusters, outprefix + '.clusters.tsv')
+        return clusters
+
+
+    def write_seqs_to_fasta(self, outfile, names):
+        '''Writes the sequence of each name in names to outfile, in sorted name order'''
+        fasta_out = pyfastaq.utils.open_file_write(outfile)
+        for seq_name in sorted(names):
+            print(self.sequence(seq_name), file=fasta_out)
+
+        pyfastaq.utils.close(fasta_out)
diff --git a/ariba/report.py b/ariba/report.py
new file mode 100644
index 00000000..d0aff98e
--- /dev/null
+++ b/ariba/report.py
@@ -0,0 +1,209 @@
+import pymummer
+
+columns = [
+    'ref_name',              # 0  name of reference sequence
+    'ref_type',              # 1  type of reference sequence (presence/absence, variants only, noncoding)
+    'flag',                  # 2  cluster flag
+    'reads',                 # 3  number of reads in this cluster
+    'cluster_rep',           # 4  name of cluster representative from cd hit
+    'ref_len',               # 5  length of reference sequence
+    'ref_base_assembled',    # 6  number of reference nucleotides assembled by this contig
+    'pc_ident',              # 7  %identity between ref sequence and contig
+    'ctg',                   # 8  name of contig matching reference
+    'ctg_len',               # 9  length of contig matching reference
+    'known_var',             # 10 is this a known SNP from reference metadata? 1|0
+    'var_type',              # 11 The type of variant. Currently only SNP supported
+    'var_seq_type',          # 12 if known_var=1, n|p for nucleotide or protein
+    'known_var_change',      # 13 if known_var=1, the wild/variant change, eg I42L
+    'has_known_var',         # 14 if known_var=1, 1|0 for whether or not the assembly has the variant
+    'ref_ctg_change',        # 15 amino acid or nucleotide change between reference and contig, eg I42L
+    'ref_ctg_effect',        # 16 effect of change between reference and contig, eg SYS, NONSYN (amino acid changes only)
+    'ref_start',             # 17 start position of variant in reference
+    'ref_end',               # 18 end position of variant in reference
+    'ref_nt',                # 19 nucleotide(s) in reference at variant position
+    'ctg_start',             # 20 start position of variant in contig
+    'ctg_end',               # 21 end position of variant in contig
+    'ctg_nt',                # 22 nucleotide(s) in contig at variant position
+    'smtls_total_depth',     # 23 total read depth at variant start position in contig, reported by mpileup
+    'smtls_alt_nt',          # 24 alt nucleotides on contig, reported by mpileup
+    'smtls_alt_depth',       # 25 alt depth on contig, reported by mpileup
+    'var_description',       # 26 description of variant from reference metadata
+    'free_text',             # 27 other free text about reference sequence, from reference metadata
+]
+
+
+def header_line():
+    return '\t'.join(columns)  # the names in `columns`, tab-separated, in report column order
+
+
+def _samtools_depths_at_known_snps_all_wild(sequence_meta, contig_name, cluster, variant_list):
+    '''Input is a known variant, as a sequence_metadata object. The
+       assumption is that both the reference and the assembly have the
+       variant type, not wild type. The list variant_list should be a list
+       of pymummer.variant.Variant objects, only containing variants to the
+       relevant query contig. Returns None if the variant has no nucleotide
+       range, otherwise the nine report columns ref_start to smtls_alt_depth
+       as a list of strings (positions are 1-based, '.' means not found)'''
+    ref_nuc_range = sequence_meta.variant.nucleotide_range()
+
+    if ref_nuc_range is None:
+        return None
+
+    ctg_nts = []
+    ref_nts = []
+    smtls_total_depths = []
+    smtls_alt_nts = []
+    smtls_alt_depths = []
+    contig_positions = []
+
+    for ref_position in range(ref_nuc_range[0], ref_nuc_range[1]+1, 1):
+        nucmer_match = cluster.assembly_compare.nucmer_hit_containing_reference_position(cluster.assembly_compare.nucmer_hits, cluster.ref_sequence.id, ref_position)
+        if nucmer_match is None:
+            continue
+
+        # work out contig position. Needs indels variants to correct the position
+        ref_nts.append(cluster.ref_sequence[ref_position])
+        contig_position, in_indel = nucmer_match.qry_coords_from_ref_coord(ref_position, variant_list)
+        contig_positions.append(contig_position)
+        # get_depths_at_position() returns None when it has nothing for this position
+        depths_tuple = cluster.samtools_vars.get_depths_at_position(contig_name, contig_position)
+        if depths_tuple is not None:
+            ctg_nts.append(depths_tuple[0])
+            smtls_alt_nts.append(depths_tuple[1])
+            smtls_total_depths.append(str(depths_tuple[2]))
+            smtls_alt_depths.append(str(depths_tuple[3]))
+
+    # contig_positions is empty (it is never None) if no nucmer hit covers the variant
+    ctg_start = str(min(contig_positions) + 1) if len(contig_positions) else '.'
+    ctg_end = str(max(contig_positions) + 1) if len(contig_positions) else '.'
+
+    return [
+        str(ref_nuc_range[0] + 1),
+        str(ref_nuc_range[1] + 1),
+        ';'.join(ref_nts) if len(ref_nts) else '.',
+        ctg_start,
+        ctg_end,
+        ';'.join(ctg_nts) if len(ctg_nts) else '.',
+        ';'.join(smtls_total_depths) if len(smtls_total_depths) else '.',
+        ';'.join(smtls_alt_nts) if len(smtls_alt_nts) else '.',
+        ';'.join(smtls_alt_depths) if len(smtls_alt_depths) else '.',
+    ]
+
+
+def _report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, pymummer_variants):
+    '''Returns a list of tab-separated report lines (strings) for one contig of a
+       cluster. There is one line per variant in cluster.assembly_variants[contig_name],
+       or one per matching known variant when a variant has any. If there are no
+       variants to report, one line is returned with '.' in the variant columns'''
+    lines = []
+
+    common_first_columns = [
+        cluster.ref_sequence.id,
+        cluster.ref_sequence_type,
+        str(cluster.status_flag),
+        str(cluster.total_reads),
+        cluster.name,
+        str(len(cluster.ref_sequence)),
+        str(ref_cov_per_contig[contig_name]) if contig_name in ref_cov_per_contig else '0', # 6 ref bases assembled
+        str(cluster.assembly_compare.percent_identities[contig_name]) if contig_name in cluster.assembly_compare.percent_identities else '0',
+        contig_name,
+        str(len(cluster.assembly.sequences[contig_name])),  # 9 length of scaffold matching reference
+    ]
+
+    # free_text is None when the metadata line had no free text column, and None cannot be joined
+    if cluster.ref_sequence.id in cluster.refdata.metadata and  len(cluster.refdata.metadata[cluster.ref_sequence.id]['.']) > 0:
+        free_text_columns = ['.' if x.free_text is None else x.free_text for x in cluster.refdata.metadata[cluster.ref_sequence.id]['.']]
+    else:
+        free_text_columns = ['.']
+
+    if cluster.assembled_ok and contig_name in cluster.assembly_variants and len(cluster.assembly_variants[contig_name]) > 0:
+        for (position, var_seq_type, ref_ctg_change, var_effect, contributing_vars, matching_vars_set, metainfo_set) in cluster.assembly_variants[contig_name]:
+            if len(matching_vars_set) > 0:
+                is_known_var = '1'
+                known_var_change = 'unknown'
+                var_type = 'SNP'
+                has_known_var = '1'
+                matching_vars_column = ';;;'.join([x.to_string(separator='_') for x in matching_vars_set])
+            else:
+                is_known_var = '0'
+                known_var_change = '.'
+                has_known_var = '0'
+                var_type = '.'
+                matching_vars_column = '.'
+
+            var_columns = ['.' if x is None else str(x) for x in [is_known_var, var_type, var_seq_type, known_var_change, has_known_var, ref_ctg_change, var_effect]]
+
+            if contributing_vars is None:
+                samtools_columns = ['.'] * 9 # nine strings: one per column from ref_start to smtls_alt_depth
+            else:
+                contributing_vars.sort(key = lambda x: x.qry_start)
+
+                smtls_total_depth = []
+                smtls_alt_nt = []
+                smtls_alt_depth = []
+
+                for var in contributing_vars:
+                    depths_tuple = cluster.samtools_vars.get_depths_at_position(contig_name, var.qry_start)
+                    if depths_tuple is not None:
+                        smtls_alt_nt.append(depths_tuple[1])
+                        smtls_total_depth.append(str(depths_tuple[2]))
+                        smtls_alt_depth.append(str(depths_tuple[3]))
+
+                smtls_total_depth = ';'.join(smtls_total_depth) if len(smtls_total_depth) else '.'
+                smtls_alt_nt = ';'.join(smtls_alt_nt) if len(smtls_alt_nt) else '.'
+                smtls_alt_depth = ';'.join(smtls_alt_depth) if len(smtls_alt_depth) else '.'
+                samtools_columns = [
+                        str(contributing_vars[0].ref_start), #ref_start
+                        str(contributing_vars[0].ref_end), # ref_end
+                        ';'.join([x.ref_base for x in contributing_vars]), # ref_nt
+                        str(contributing_vars[0].qry_start),  # ctg_start
+                        str(contributing_vars[0].qry_end),  #ctg_end
+                        ';'.join([x.qry_base for x in contributing_vars]), #ctg_nt
+                        smtls_total_depth,
+                        smtls_alt_nt,
+                        smtls_alt_depth,
+                ]
+
+            if len(matching_vars_set) > 0:
+                for matching_var in matching_vars_set:
+                    if contributing_vars is None:
+                        samtools_columns = _samtools_depths_at_known_snps_all_wild(matching_var, contig_name, cluster, pymummer_variants)
+                    var_columns[3] = str(matching_var.variant)
+
+                    if matching_var.has_variant(cluster.ref_sequence) == (ref_ctg_change is not None):
+                        var_columns[4] = '0'
+                    else:
+                        var_columns[4] = '1'
+
+                    if samtools_columns is None:
+                        samtools_columns = ['.'] * 9
+
+                    lines.append('\t'.join(common_first_columns + var_columns + samtools_columns + [matching_vars_column] + free_text_columns))
+            else:
+                lines.append('\t'.join(common_first_columns + var_columns + samtools_columns + [matching_vars_column] + free_text_columns))
+    else:
+        lines.append('\t'.join(common_first_columns + ['.'] * (len(columns) - len(common_first_columns) - 1) + free_text_columns))
+
+    return lines
+
+
+def report_lines(cluster):
+    '''Returns the list of report lines for a cluster, or None if there are none.
+       A cluster flagged ref_seq_choose_fail or assembly_fail gets a single line
+       that has '.' in every column after the cluster name'''
+    if cluster.status_flag.has('ref_seq_choose_fail'):
+        return ['\t'.join(['.', '.', str(cluster.status_flag), str(cluster.total_reads), cluster.name] + ['.'] * (len(columns) - 5))]
+    if cluster.status_flag.has('assembly_fail'):
+        return ['\t'.join([cluster.ref_sequence.id, cluster.ref_sequence_type, str(cluster.status_flag), str(cluster.total_reads), cluster.name] + ['.'] * (len(columns) - 5))]
+
+    ref_cov_per_contig = cluster.assembly_compare.ref_cov_per_contig(cluster.assembly_compare.nucmer_hits)
+    all_variants = pymummer.snp_file.get_all_variants(cluster.assembly_compare.nucmer_snps_file)
+    lines = []
+
+    for contig_name in sorted(cluster.assembly.sequences):
+        # _report_lines_for_one_contig is only given the variants that are on this contig
+        lines.extend(_report_lines_for_one_contig(cluster, contig_name, ref_cov_per_contig, [v for v in all_variants if v.qry_name == contig_name]))
+
+    assert all(len(line.split('\t')) == len(columns) for line in lines)
+    return lines or None
+
diff --git a/ariba/samtools_variants.py b/ariba/samtools_variants.py
new file mode 100644
index 00000000..80d224b8
--- /dev/null
+++ b/ariba/samtools_variants.py
@@ -0,0 +1,176 @@
+import os
+import sys
+import pysam
+import pyfastaq
+from ariba import common
+
+class Error(Exception): pass  # exception type defined for the samtools_variants module
+
+
+class SamtoolsVariants:
+    '''Runs samtools mpileup and bcftools on a BAM file of reads mapped to ref_fa.
+       run() makes outprefix.vcf and outprefix.read_depths.gz, which other methods read'''
+    def __init__(self,
+      ref_fa,
+      bam,
+      outprefix,
+      log_fh=sys.stdout,
+      samtools_exe='samtools',
+      bcftools_exe='bcftools',
+      bcf_min_dp=10,
+      bcf_min_dv=5,
+      bcf_min_dv_over_dp=0.3,
+      bcf_min_qual=20,
+    ):
+        self.ref_fa = os.path.abspath(ref_fa)
+        self.bam = os.path.abspath(bam)
+        self.outprefix = os.path.abspath(outprefix)
+        self.log_fh = log_fh
+        self.samtools_exe = samtools_exe
+        self.bcftools_exe = bcftools_exe
+        self.bcf_min_dp = bcf_min_dp
+        self.bcf_min_dv = bcf_min_dv
+        self.bcf_min_dv_over_dp = bcf_min_dv_over_dp
+        self.bcf_min_qual = bcf_min_qual
+        self.vcf_file = self.outprefix + '.vcf'
+        self.read_depths_file = self.outprefix + '.read_depths.gz'
+
+    def _make_vcf_and_read_depths_files(self):
+        '''Makes self.read_depths_file (compressed and tabix indexed) and self.vcf_file
+           (variant calls that pass the bcf_min_* filters). Temporary files are deleted'''
+        tmp_vcf = self.vcf_file + '.tmp'
+        cmd = ' '.join([
+            self.samtools_exe, 'mpileup',
+            '-t INFO/DPR,DV',
+            '-A',
+            '-f', self.ref_fa,
+            '-u',
+            '-v',
+            self.bam,
+            '>',
+            tmp_vcf
+        ])
+
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+
+        cmd = ' '.join([
+            self.bcftools_exe, 'call -m',
+            tmp_vcf,
+            '|',
+            self.bcftools_exe, 'query',
+            r'''-f '%CHROM\t%POS\t%REF\t%ALT\t%DP\t%DPR]\n' ''',
+            '>',
+            self.read_depths_file + '.tmp'
+        ])
+
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+        pysam.tabix_compress(self.read_depths_file + '.tmp', self.read_depths_file)
+        pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
+        os.unlink(self.read_depths_file + '.tmp')
+
+        cmd = ' '.join([
+            self.bcftools_exe, 'call -m -v',
+            tmp_vcf,
+            '|',
+            self.bcftools_exe, 'filter',
+            '-i', '"MIN(DP)>=' + str(self.bcf_min_dp),
+                  ' & MIN(DV)>=' + str(self.bcf_min_dv),
+                  ' & MIN(DV/DP)>=' + str(self.bcf_min_dv_over_dp),
+                  ' & QUAL >=', str(self.bcf_min_qual), '"',
+            '-o', self.vcf_file
+        ])
+
+        common.syscall(cmd, verbose=True, verbose_filehandle=self.log_fh)
+        os.unlink(tmp_vcf)
+
+    @classmethod
+    def _get_read_depths(cls, read_depths_file, sequence_name, position):
+        '''Returns tuple (ref base, alt base, DP value as an int, DPR value as a string)
+           from the line of read_depths_file for position (0-based) of sequence_name.
+           Returns None if the lookup fails, or if there is no line for that position'''
+        assert os.path.exists(read_depths_file)
+        assert os.path.exists(read_depths_file + '.tbi')
+        tbx = pysam.TabixFile(read_depths_file)
+        try:
+            rows = [x for x in tbx.fetch(sequence_name, position, position + 1)]
+        except Exception: # best effort: any failed lookup means no depths
+            return None
+        finally:
+            tbx.close() # always release the file handle, whether or not fetch worked
+
+        if len(rows) > 1: # which happens with indels, multiple lines for same base of reference
+            test_rows = [x for x in rows if x.rstrip().split()[3] != '.']
+            if len(test_rows) != 1:
+                rows = [rows[-1]]
+            else:
+                rows = test_rows
+        if len(rows) != 1:
+            return None
+        r, p, ref_base, alt_base, ref_counts, alt_counts = rows[0].rstrip().split()
+        return ref_base, alt_base, int(ref_counts), alt_counts
+
+    @classmethod
+    def _get_variant_positions_from_vcf(cls, vcf_file):
+        '''Returns list of (sequence name, 0-based position), one per record of vcf_file. Returns [] if there is no file'''
+        if not os.path.exists(vcf_file):
+            return []
+        f = pyfastaq.utils.open_file_read(vcf_file)
+        positions = [l.rstrip().split('\t')[0:2] for l in f if not l.startswith('#')]
+        positions = [(t[0], int(t[1]) - 1) for t in positions]
+        pyfastaq.utils.close(f)
+        return positions
+
+    @staticmethod
+    def _get_variants(vcf_file, read_depths_file, positions=None):
+        '''Returns dictionary of sequence name -> position -> tuple from _get_read_depths, for each
+           (name, position) in positions that has depths. positions=None means use those in vcf_file'''
+        if positions is None:
+            positions = SamtoolsVariants._get_variant_positions_from_vcf(vcf_file)
+        variants = {}
+        if len(positions) == 0:
+            return variants
+        if not (os.path.exists(vcf_file) and os.path.exists(read_depths_file)):
+            return variants
+        for t in positions:
+            name, pos = t[0], t[1]
+            depths = SamtoolsVariants._get_read_depths(read_depths_file, name, pos)
+            if depths is None:
+                continue
+            if name not in variants:
+                variants[name] = {}
+            variants[name][pos] = depths
+        return variants
+
+    @staticmethod
+    def variants_in_coords(nucmer_matches, vcf_file):
+        '''nucmer_matches = made by assembly_compare.assembly_match_coords().
+           Returns number of variants that lie in nucmer_matches'''
+        vcf_variant_counts = {}
+        f = pyfastaq.utils.open_file_read(vcf_file)
+        for line in f:
+            if line.startswith('#'):
+                continue
+
+            data = line.rstrip().split('\t')
+            scaff = data[0]
+
+            if scaff in nucmer_matches:
+                position = int(data[1]) - 1
+                i = pyfastaq.intervals.Interval(position, position)
+                if any(x.intersects(i) for x in nucmer_matches[scaff]):
+                    vcf_variant_counts[scaff] = vcf_variant_counts.get(scaff, 0) + 1
+
+        pyfastaq.utils.close(f)
+        return sum(list(vcf_variant_counts.values()))
+
+    def get_depths_at_position(self, seq_name, position):
+        '''Returns the tuple from _get_read_depths for position of seq_name, or None
+           if _get_variants has nothing for it'''
+        d = self._get_variants(self.vcf_file, self.read_depths_file, [(seq_name, position)])
+        return d.get(seq_name, {}).get(position)
+
+    def run(self):
+        '''Makes the VCF and read depths files'''
+        self._make_vcf_and_read_depths_files()
+        # This is to make this object picklable, to keep multithreading happy
+        self.log_fh = None
diff --git a/ariba/sequence_metadata.py b/ariba/sequence_metadata.py
new file mode 100644
index 00000000..f7c1037e
--- /dev/null
+++ b/ariba/sequence_metadata.py
@@ -0,0 +1,68 @@
+from ariba import sequence_variant
+
+class Error(Exception): pass  # raised by SequenceMetadata for a metadata line that cannot be parsed
+
+
+class SequenceMetadata:
+    '''One line of the reference metadata file. The tab-separated columns are:
+       sequence name, variant type ('.' means no variant), variant (eg I42L),
+       always report (Y or N), and optionally one column of free text'''
+    def __init__(self, line):
+        try:
+            self.name, variant_type, variant_string, always_report, *extra_columns = line.rstrip().split('\t')
+        except ValueError: # the unpacking fails if there are fewer than four columns
+            raise Error('Error parsing line of file:\n' + line)
+
+        if len(extra_columns) == 0:
+            self.free_text = None
+        elif len(extra_columns) == 1:
+            self.free_text = extra_columns[0]
+        else:
+            raise Error('Too many columns in this line:\n' + line)
+
+        self.variant_type = variant_type
+
+        if self.variant_type == '.':
+            self.variant = None
+        else:
+            self.variant = sequence_variant.Variant(self.variant_type, variant_string)
+
+        if always_report == 'Y':
+            self.always_report = True
+        elif always_report == 'N':
+            self.always_report = False
+        else:
+            raise Error('Error getting always_report column. Expected "Y" or "N" but got ' + always_report)
+
+        self.hashed = hash((self.name, self.variant_type, self.always_report, variant_string))
+
+    def __eq__(self, other):
+        return type(other) is type(self) and self.__dict__ == other.__dict__
+
+    def __lt__(self, other):
+        if self.name != other.name:
+            return self.name < other.name
+        if self.variant is None or other.variant is None:
+            # None cannot be compared with <. Metadata with no variant sorts first
+            return self.variant is None and other.variant is not None
+        return self.variant < other.variant
+
+    def __hash__(self):
+        return self.hashed
+
+    def __str__(self):
+        return self.to_string()
+
+    def to_string(self, separator='\t'):
+        '''Returns the columns joined by separator. The free text column is left out if it is None or empty'''
+        fields = [self.name, self.variant_type]
+        fields.append('.' if self.variant is None else str(self.variant))
+        fields.append('Y' if self.always_report else 'N')
+        if self.free_text:
+            return separator.join(fields + [self.free_text])
+        else:
+            return separator.join(fields)
+
+    def has_variant(self, seq):
+        '''Returns True if this metadata has a variant and seq has it. False if there is no variant'''
+        return self.variant is not None and self.variant.has_variant(seq)
diff --git a/ariba/sequence_variant.py b/ariba/sequence_variant.py
new file mode 100644
index 00000000..fa475c92
--- /dev/null
+++ b/ariba/sequence_variant.py
@@ -0,0 +1,69 @@
+import pyfastaq
+import re
+
+class Error(Exception): pass  # raised by Variant for a bad variant type or variant string
+
+
+allowed_variant_types = {'n', 'p'}  # 'n' = nucleotide variant, 'p' = protein (amino acid) variant
+
+class Variant:
+    '''A single substitution, written like I42L: wild type value, position and
+       variant value. variant_type is 'n' if the values are nucleotides and the
+       position is in nucleotides, or 'p' if they are amino acids'''
+    def __init__(self, variant_type, variant_string):
+        if variant_type not in allowed_variant_types:
+            raise Error('Error! Variant type "' + variant_type + '" not recognised.\n' + \
+                        'Must be one of:' + ', '.join(sorted(allowed_variant_types)))
+
+        self.variant_type = variant_type
+
+        m = re.match(r'^([A-Z])([0-9]+)([A-Z])$', variant_string.upper())
+        if m is None:
+            # the message must be one string: Error(a, b) would be shown as a tuple
+            raise Error('Unexpected format of variant string: ' + variant_string)
+
+        self.wild_value, self.position, self.variant_value = m.group(1, 2, 3)
+        # the variant string counts positions from 1, self.position counts from 0
+        self.position = int(self.position) - 1
+
+    def __eq__(self, other):
+        return type(other) is type(self) and self.__dict__ == other.__dict__
+
+    def __lt__(self, other):
+        '''Orders by position, then variant type, then wild value, then variant value'''
+        return (self.position, self.variant_type, self.wild_value, self.variant_value) < \
+               (other.position, other.variant_type, other.wild_value, other.variant_value)
+
+    def __str__(self):
+        '''Returns the variant in upper case, in the form it was given in, eg I42L'''
+        return ''.join([self.wild_value, str(self.position + 1), self.variant_value])
+
+    def sanity_check_against_seq(self, seq, translate_seq=False):
+        '''Returns True if seq is long enough to include the position of this variant,
+           and has the wild or the variant value there. If translate_seq is True, then
+           seq is first put into a pyfastaq.sequences.Fasta and translated'''
+        if translate_seq:
+            seq = pyfastaq.sequences.Fasta('x', seq).translate().seq
+
+        return len(seq) >= self.position + 1 and seq[self.position].upper() in [self.wild_value, self.variant_value]
+
+    def has_variant(self, seq):
+        '''Returns True if seq has the variant value at the position of this variant.
+           seq is translated first for a protein variant. The position must be inside seq'''
+        if self.variant_type == 'p':
+            test_seq = seq.translate()
+        else:
+            test_seq = seq
+
+        assert self.position < len(test_seq)
+        return test_seq[self.position] == self.variant_value
+
+    def nucleotide_range(self):
+        '''Returns the nucleotide (start, end) positions inclusive of this variant.
+           start==end if it's a nucleotide variant, otherwise it's an amino acid
+           variant and start+2==end (the three nucleotides of its codon)'''
+        if self.variant_type == 'p':
+            return 3 * self.position, 3 * self.position + 2
+        else:
+            return self.position, self.position
+
diff --git a/ariba/summary.py b/ariba/summary.py
index d6625956..732a825d 100644
--- a/ariba/summary.py
+++ b/ariba/summary.py
@@ -1,47 +1,27 @@
 import os
+import re
 import openpyxl
 import pyfastaq
-from ariba import flag, common
+from ariba import flag, common, reference_data, report
 
 class Error (Exception): pass
 
-columns = [
-    'gene',
-    'flag',
-    'reads',
-    'cluster',
-    'gene_len',
-    'assembled',
-    'pc_ident',
-    'var_type',
-    'var_effect',
-    'new_aa',
-    'gene_start',
-    'gene_end',
-    'gene_nt',
-    'scaffold',
-    'scaff_len',
-    'scaff_start',
-    'scaff_end',
-    'scaff_nt',
-    'read_depth',
-    'alt_bases',
-    'ref_alt_depth'
-]
 
 int_columns = [
     'reads',
-    'gene_len',
-    'assembled',
-    'gene_start',
-    'gene_end',
-    'scaff_len',
-    'scaff_start',
-    'scaff_end',
-    'read_depth',
+    'ref_len',
+    'ref_base_assembled',
+    'ctg_len',
+    'ref_start',
+    'ref_end',
+    'ctg_start',
+    'ctg_end',
 ]
 
 
+float_columns = ['pc_ident']
+
+
 class Summary:
     def __init__(
       self,
@@ -49,7 +29,7 @@ def __init__(
       filenames=None,
       fofn=None,
       filter_output=True,
-      js_candy_prefix=None,
+      phandango_prefix=None,
       min_id=90.0
     ):
         if filenames is None and fofn is None:
@@ -66,7 +46,7 @@ def __init__(
         self.filter_output = filter_output
         self.min_id = min_id
         self.outfile = outfile
-        self.js_candy_prefix = js_candy_prefix
+        self.phandango_prefix = phandango_prefix
 
 
     def _load_fofn(self, fofn):
@@ -82,138 +62,199 @@ def _check_files_exist(self):
                 raise Error('File not found: "' + fname + '". Cannot continue')
 
 
-    def _line2dict(self, line):
+    @classmethod
+    def _line2dict(cls, line):
         data = line.rstrip().split('\t')
-        d = {columns[i]: data[i] for i in range(len(data))}
+        if len(data) != len(report.columns):
+            raise Error('Wrong number of columns in the following line. Expected ' + str(len(report.columns)) + ' but got ' + str(len(data)) + '\n' + line)
+        d = {report.columns[i]: data[i] for i in range(len(data))}
         d['flag'] = flag.Flag(int(d['flag']) )
         for key in int_columns:
             try:
                 d[key] = int(d[key])
             except:
                 assert d[key] == '.'
-        try:
-            d['pc_ident'] = float(d['pc_ident'])
-        except:
-            assert d['pc_ident'] == '.'
+
+        for key in float_columns:
+            try:
+                d[key] = float(d[key])
+            except:
+                assert d[key] == '.'
+
         return d
 
 
-    def _load_file(self, filename):
+    @classmethod
+    def _dict2key(cls, d):
+        if d['var_type'] == '.':
+            return d['ref_name'], '', ''
+        elif d['known_var_change'] == d['ref_ctg_change'] == '.':
+            raise Error('Unexpected data in ariba summary... \n' + str(d) + '\n... known_var_change and ref_ctg_change both equal to ".", but var_type was not a ".". Cannot continue')
+        else:
+            if '.' not in [d['known_var_change'], d['ref_ctg_change']] and d['known_var_change'] != d['ref_ctg_change']:
+                raise Error('Unexpected data in ariba summary... \n' + str(d) + '\n... known_var_change != ref_ctg_change. Cannot continue')
+            if d['known_var_change'] != '.':
+                change = d['known_var_change']
+            else:
+                change = d['ref_ctg_change']
+
+            return d['ref_name'], d['var_seq_type'], change
+
+
+    @classmethod
+    def _load_file(cls, filename):
         f = pyfastaq.utils.open_file_read(filename)
         d = {}
 
         for line in f:
             if line.startswith('#'):
-                if line.rstrip()[1:].split('\t') != columns:
+                if line.rstrip()[1:].split('\t') != report.columns:
+                    pyfastaq.utils.close(f)
                     raise Error('Error parsing the following line.\n' + line)
                 continue
-            data = self._line2dict(line)
-
-            if data['gene'] not in d:
-                d[data['gene']] = []
-
-            d[data['gene']].append(data)
+            data = Summary._line2dict(line)
+            key = Summary._dict2key(data)
+            if key[0] not in d:
+                d[key[0]] = {}
+            d[key[0]][key] = data
 
         pyfastaq.utils.close(f)
         return d
 
 
-    def _to_summary_number(self, l):
-        f = l[0]['flag']
-        if f.has('assembly_fail') or not f.has('gene_assembled') or self._pc_id_of_longest(l) <= self.min_id:
-            return 0
+    @classmethod
+    def _pc_id_of_longest(cls, data_dict, seq_name):
+        longest = 0
+        identity = 0
+        assert seq_name in data_dict
 
-        if f.has('hit_both_strands') or (not f.has('complete_orf')):
-            return 1
+        for d in data_dict[seq_name].values():
+            if d['ref_base_assembled'] > longest:
+                longest = d['ref_base_assembled']
+                identity = d['pc_ident']
+
+        return identity
+
+
+    @classmethod
+    def _to_summary_number_for_seq(cls, data_dict, seq_name, min_id):
+        f = list(data_dict[seq_name].values())[0]['flag']
 
-        if f.has('unique_contig') and f.has('gene_assembled_into_one_contig') and f.has('complete_orf'):
+        if f.has('assembly_fail') or (not f.has('assembled')) or f.has('ref_seq_choose_fail') or Summary._pc_id_of_longest(data_dict, seq_name) <= min_id:
+            return 0
+        elif f.has('assembled_into_one_contig') and f.has('complete_orf') and f.has('unique_contig') and (not f.has('scaffold_graph_bad')) and (not f.has('variants_suggest_collapsed_repeat')) and (not f.has('hit_both_strands')) and (not f.has('region_assembled_twice')):
             if f.has('has_nonsynonymous_variants'):
-                return 3
+                return 2
             else:
-                return 4
+                return 3
         else:
-            return 2
+            return 1
 
 
-    def _pc_id_of_longest(self, l):
-        longest = 0
-        identity = None
-        for data in l:
-            if data['assembled'] > longest:
-                longest = data['assembled']
-                identity = data['pc_ident']
+    @classmethod
+    def _to_summary_number_for_variant(cls, data_dict):
+        if data_dict['has_known_var'] == '1' or (data_dict['known_var'] != '1' and data_dict['ref_ctg_change'] != '.'):
+            return 1
+        else:
+            return 0
 
-        assert identity is not None
-        return identity
 
+    @classmethod
+    def _gather_output_rows(cls, filenames, min_id):
+        data = {filename: Summary._load_file(filename) for filename in filenames}
 
+        all_column_tuples = set()
 
-    def _gather_output_rows(self):
-        self.data = {filename: self._load_file(filename) for filename in self.filenames}
+        for filename, data_dict in data.items():
+            for seq_name, seq_data_dict in data_dict.items():
+                all_column_tuples.update(set(seq_data_dict.keys()))
+                all_column_tuples.add((seq_name, '', ''))
 
-        all_genes = set()
-        for l in self.data.values():
-            all_genes.update(set(l.keys()))
-        all_genes = list(all_genes)
-        all_genes.sort()
 
-        self.rows_out = []
-        self.rows_out.append(['filename'] + all_genes)
 
-        for filename in self.filenames:
+        all_column_tuples = list(all_column_tuples)
+        all_column_tuples.sort()
+        rows = [['filename']]
+        for t in all_column_tuples:
+            if t[1] == t[2] == '':
+                rows[0].append(t[0])
+            else:
+                rows[0].append(t[0] + ';' + 'var.' + t[1] + '.' + t[2])
+
+        for filename in filenames:
             new_row = [filename]
-            for gene in all_genes:
-                if gene not in self.data[filename]:
+            for column_tuple in all_column_tuples:
+                if column_tuple[0] not in data[filename]:
                     new_row.append(0)
+                elif column_tuple[1] == '':
+                    new_row.append(Summary._to_summary_number_for_seq(data[filename], column_tuple[0], min_id))
+                elif column_tuple in data[filename][column_tuple[0]]:
+                    new_row.append(Summary._to_summary_number_for_variant(data[filename][column_tuple[0]][column_tuple]))
                 else:
-                    new_row.append(self._to_summary_number(self.data[filename][gene]))
+                    new_row.append(0)
 
-            self.rows_out.append(new_row)
+            rows.append(new_row)
 
+        return rows
 
-    def _filter_output_rows(self):
-        if not self.filter_output:
-            return
 
+    @classmethod
+    def _filter_output_rows(cls, rows):
         # remove rows that are all zeros
-        self.rows_out = [x for x in self.rows_out if x[1:] != [0]*(len(x)-1)]
+        rows = [x for x in rows if x[1:] != [0]*(len(x)-1)]
 
         # remove columns that are all zeros
         to_remove = []
-        for i in range(1, len(self.rows_out[0])):
-            if sum([x[i] for x in self.rows_out[1:]]) == 0:
+        for i in range(1, len(rows[0])):
+            if sum([x[i] for x in rows[1:]]) == 0:
                 to_remove.append(i)
 
-        for i in range(len(self.rows_out)):
-            self.rows_out[i] = [self.rows_out[i][j] for j in range(len(self.rows_out[i])) if j not in to_remove]
+        for i in range(len(rows)):
+            rows[i] = [rows[i][j] for j in range(len(rows[i])) if j not in to_remove]
 
+        return rows
 
 
-    def _write_tsv(self):
-        f = pyfastaq.utils.open_file_write(self.outfile)
+    @classmethod
+    def _write_tsv(cls, rows, outfile):
+        f = pyfastaq.utils.open_file_write(outfile)
         print('#', end='', file=f)
-        for row in self.rows_out:
+        for row in rows:
             print('\t'.join([str(x) for x in row]), file=f)
         pyfastaq.utils.close(f)
 
 
-    def _write_js_candy_csv(self, outfile):
+    @classmethod
+    def _write_phandango_csv(cls, rows, outfile):
+        # phandango needs the "name" column.
+        # Names must match those used in the tree file.
+        # we also need to add suffixes like :z1 to make phandango colour
+        # the columns consistently. We want to colour just sequence
+        # columns the same, and then all variant columns the same
+        header_line = ['name']
+        var_regex = re.compile(r'^.*;var\.[np.]\.\S+$')  # raw string: \. and \S are regex escapes, not string escapes
+
+        for heading in rows[0][1:]:
+            if var_regex.search(heading) is None:
+                header_line.append(heading + ':z1')
+            else:
+                header_line.append(heading + ':z2')
+
         f = pyfastaq.utils.open_file_write(outfile)
-        # js candy needs the "name" column.
-        # Names must match those used in the tree file
-        print('name', *self.rows_out[0][1:], sep=',', file=f)
-        for row in self.rows_out[1:]:
+        print(*header_line, sep=',', file=f)
+        for row in rows[1:]:
             print(*row, sep=',', file=f)
         pyfastaq.utils.close(f)
 
 
-    def _write_xls(self):
+    @classmethod
+    def _write_xls(cls, rows, outfile):
         workbook = openpyxl.Workbook()
         worksheet = workbook.worksheets[0]
         worksheet.title = 'ARIBA_summary'
-        for row in self.rows_out:
+        for row in rows:
             worksheet.append(row)
-        workbook.save(self.outfile)
+        workbook.save(outfile)
 
 
     @staticmethod
@@ -230,28 +271,29 @@ def _distance_score_between_lists(cls, scores1, scores2):
         return sum([cls._distance_score_between_values(scores1[i], scores2[i]) for i in range(1, len(scores1))])
 
 
-    def _write_distance_matrix(self, outfile):
-        if len(self.rows_out) < 3:
-            raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
+    @classmethod
+    def _write_distance_matrix(cls, rows, outfile):
+        if len(rows) < 3:
+            raise Error('Cannot calculate distance matrix to make tree for phandango.\n' +
                         'Only one sample present.')
 
-        if len(self.rows_out[0]) < 2:
-            raise Error('Cannot calculate distance matrix to make tree for js_candy.\n' +
+        if len(rows[0]) < 2:
+            raise Error('Cannot calculate distance matrix to make tree for phandango.\n' +
                         'No genes present in output')
 
         with open(outfile, 'w') as f:
-            sample_names = [x[0] for x in self.rows_out]
+            sample_names = [x[0] for x in rows]
             print(*sample_names[1:], sep='\t', file=f)
 
-            for i in range(1,len(self.rows_out)):
+            for i in range(1,len(rows)):
                 scores = []
-                for j in range(2, len(self.rows_out)):
-                    scores.append(self._distance_score_between_lists(self.rows_out[i], self.rows_out[j]))
-                print(self.rows_out[i][0], *scores, sep='\t', file=f)
+                for j in range(2, len(rows)):
+                    scores.append(Summary._distance_score_between_lists(rows[i], rows[j]))
+                print(rows[i][0], *scores, sep='\t', file=f)
 
 
-    @staticmethod
-    def _newick_from_dist_matrix(distance_file, outfile):
+    @classmethod
+    def _newick_from_dist_matrix(cls, distance_file, outfile):
         r_script = outfile + '.tmp.R'
 
         with open(r_script, 'w') as f:
@@ -265,24 +307,28 @@ def _newick_from_dist_matrix(distance_file, outfile):
         os.unlink(r_script)
 
 
-    def _write_js_candy_files(self, outprefix):
+    @classmethod
+    def _write_phandango_files(cls, rows, outprefix):
         distance_file = outprefix + '.distance_matrix'
         tree_file = outprefix + '.tre'
         csv_file = outprefix + '.csv'
-        self._write_distance_matrix(distance_file)
-        self._newick_from_dist_matrix(distance_file, tree_file)
+        Summary._write_distance_matrix(rows, distance_file)
+        Summary._newick_from_dist_matrix(distance_file, tree_file)
         os.unlink(distance_file)
-        self._write_js_candy_csv(csv_file)
+        Summary._write_phandango_csv(rows, csv_file)
 
 
     def run(self):
         self._check_files_exist()
-        self._gather_output_rows()
-        self._filter_output_rows()
+        rows = Summary._gather_output_rows(self.filenames, self.min_id)
+
+        if self.filter_output:  # __init__ stores this flag as self.filter_output
+            rows = Summary._filter_output_rows(rows)
+
         if self.outfile.endswith('.xls'):
-            self._write_xls()
+            Summary._write_xls(rows, self.outfile)
         else:
-            self._write_tsv()
+            Summary._write_tsv(rows, self.outfile)
 
-        if self.js_candy_prefix is not None:
-            self._write_js_candy_files(self.js_candy_prefix)
+        if self.phandango_prefix is not None:
+            Summary._write_phandango_files(rows, self.phandango_prefix)
diff --git a/ariba/tasks/getref.py b/ariba/tasks/getref.py
new file mode 100644
index 00000000..87766de3
--- /dev/null
+++ b/ariba/tasks/getref.py
@@ -0,0 +1,20 @@
+import argparse
+import sys
+from ariba import ref_genes_getter
+
+
+def run():
+    allowed_dbs = ['argannot', 'card', 'resfinder']
+    parser = argparse.ArgumentParser(
+        description = 'Downloads reference data',
+        usage = 'ariba getref [options] <' + '|'.join(allowed_dbs) + '> <outprefix>'
+    )
+
+    parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+    parser.add_argument('db', help='Database to download. Must be one of: ' + ' '.join(allowed_dbs), choices=allowed_dbs)
+    parser.add_argument('outprefix', help='Prefix of output filenames')
+    options = parser.parse_args()
+
+    getter = ref_genes_getter.RefGenesGetter(options.db, genetic_code=options.genetic_code)
+    getter.run(options.outprefix)
+
diff --git a/ariba/tasks/refcheck.py b/ariba/tasks/refcheck.py
index 4184494d..b3ed87af 100644
--- a/ariba/tasks/refcheck.py
+++ b/ariba/tasks/refcheck.py
@@ -6,26 +6,33 @@
 def run():
     parser = argparse.ArgumentParser(
         description = 'Check or fix resistance genes FASTA file',
-        usage = 'ariba refcheck [options] <infile>')
-    parser.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
-    parser.add_argument('-m', '--min_length', type=int, help='Minimum length in nucleotides of gene [%(default)s]', metavar='INT', default=6)
-    parser.add_argument('-n', '--max_length', type=int, help='Maximum length in nucleotides of gene [%(default)s]', metavar='INT', default=10000)
-    parser.add_argument('-o', '--outprefix', help='Prefix of output files. If this option is used, a fixed file will be output, together with information on what was changed in the input file. If this option is not used, the script dies if any input sequence is not OK')
-    parser.add_argument('infile', help='Input file containing genes to be checked', metavar='Filename')
-    options = parser.parse_args()
+        usage = 'ariba refcheck [options] <outprefix>',
+        epilog = 'Important: at least one of --presence_fa, --variants_fa, or --noncoding_fa must be specified')
 
-    pyfastaq.sequences.genetic_code = options.genetic_code
-    checker = ariba.refcheck.Checker(
-        options.infile,
-        min_length=options.min_length,
-        max_length=options.max_length,
-        outprefix=options.outprefix
-    )
+    input_group = parser.add_argument_group('Input files')
+    input_group.add_argument('--presence_fa', help='FASTA file of genes whose presence you want to check for', metavar='Filename')
+
+    input_group.add_argument('--variants_fa', help='FASTA file of genes that should only be reported if they have a given variant (variants specified in the tsv file given by --metadata_tsv)', metavar='Filename')
+    input_group.add_argument('--noncoding_fa', help='FASTA file of generic sequences to look for', metavar='Filename')
+    input_group.add_argument('--metadata_tsv', help='tsv file of metadata about the sequences/variants of interest', metavar='Filename')
+
+    other_group = parser.add_argument_group('Other options')
+    other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
+    other_group.add_argument('-m', '--min_gene_length', type=int, help='Minimum length in nucleotides of gene [%(default)s]', metavar='INT', default=6)
+    other_group.add_argument('-n', '--max_gene_length', type=int, help='Maximum length in nucleotides of gene [%(default)s]', metavar='INT', default=10000)
 
-    ok, reason, seq = checker.run()
+    parser.add_argument('outprefix', help='Prefix of names of output files')
 
-    if options.outprefix is None and not ok:
-        print('The following sequence not OK, for the reason:', reason)
-        print(seq)
-        sys.exit(1)
+    options = parser.parse_args()
+
+    ref_data = ariba.reference_data.ReferenceData(
+        presence_absence_fa=options.presence_fa,
+        variants_only_fa=options.variants_fa,
+        non_coding_fa=options.noncoding_fa,
+        metadata_tsv=options.metadata_tsv,
+        min_gene_length=options.min_gene_length,
+        max_gene_length=options.max_gene_length,
+        genetic_code=options.genetic_code,
+    )
 
+    ref_data.sanity_check(options.outprefix)
diff --git a/ariba/tasks/run.py b/ariba/tasks/run.py
index 800407e0..e558385b 100644
--- a/ariba/tasks/run.py
+++ b/ariba/tasks/run.py
@@ -7,12 +7,19 @@
 def run():
     parser = argparse.ArgumentParser(
         description = 'ARIBA: Antibiotic Resistance Identification By Assembly',
-        usage = 'ariba run [options] <db.fa> <reads1.fq> <reads2.fq> <outdir>')
-    parser.add_argument('db_fasta', help='FASTA file of reference genes')
+        usage = 'ariba run [options] <reads1.fq> <reads2.fq> <outdir>')
     parser.add_argument('reads_1', help='Name of fwd reads fastq file')
     parser.add_argument('reads_2', help='Name of rev reads fastq file')
     parser.add_argument('outdir', help='Output directory (must not already exist)')
 
+    refdata_group = parser.add_argument_group('Reference data options')
+    refdata_group.add_argument('--presabs', help='FASTA file of presence absence genes', metavar='FILENAME')
+    refdata_group.add_argument('--varonly', help='FASTA file of variants only genes', metavar='FILENAME')
+    refdata_group.add_argument('--noncoding', help='FASTA file of noncoding sequences', metavar='FILENAME')
+    refdata_group.add_argument('--metadata', help='tsv file of metadata about the reference sequences', metavar='FILENAME')
+    refdata_group.add_argument('--min_gene_length', type=int, help='Minimum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=6)
+    refdata_group.add_argument('--max_gene_length', type=int, help='Maximum allowed length in nucleotides of reference genes [%(default)s]', metavar='INT', default=10000)
+
     cdhit_group = parser.add_argument_group('cd-hit options')
     cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit')
     cdhit_group.add_argument('--cdhit_seq_identity_threshold', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
@@ -24,19 +31,13 @@ def run():
     nucmer_group.add_argument('--nucmer_breaklen', type=int, help='Value to use for -breaklen when running nucmer [%(default)s]', default=50, metavar='INT')
 
     assembly_group = parser.add_argument_group('Assembly options')
-    allowed_assemblers = ['velvet', 'spades']
-    assembly_group.add_argument('--assembler', help='Assembler to use. Available options: ' + ','.join(allowed_assemblers) + ' [%(default)s]', choices=allowed_assemblers, default='spades', metavar='Assembler')
-    assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs. This is also the value used for sspace -k when scaffolding [%(default)s]', default=10, metavar='INT')
     assembly_group.add_argument('--assembler_k', type=int, help='kmer size to use with assembler. You can use 0 to set kmer to 2/3 of the read length. Warning - lower kmers are usually better. [%(default)s]', metavar='INT', default=21)
     assembly_group.add_argument('--spades_other', help='Put options string to be used with spades in quotes. This will NOT be sanity checked. Do not use -k or -t: for these options you should use the ariba run options --assembler_k and --threads [%(default)s]', default="--only-assembler", metavar="OPTIONS")
-
-    refcheck_group = parser.add_argument_group('refcheck options')
-    refcheck_group.add_argument('--refcheck_min_length', type=int, help='Minimum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=6)
-    refcheck_group.add_argument('--refcheck_max_length', type=int, help='Maximum allowed length in nucleotides of reference gene [%(default)s]', metavar='INT', default=10000)
+    assembly_group.add_argument('--min_scaff_depth', type=int, help='Minimum number of read pairs needed as evidence for scaffold link between two contigs. This is also the value used for sspace -k when scaffolding [%(default)s]', default=10, metavar='INT')
 
     other_group = parser.add_argument_group('Other options')
     other_group.add_argument('--genetic_code', type=int, help='Number of genetic code to use. Currently supported 1,4,11 [%(default)s]', choices=[1,4,11], default=11, metavar='INT')
-    other_group.add_argument('--threads', type=int, help='Number of threads for bowtie2 and spades [%(default)s]', default=1, metavar='INT')
+    other_group.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
     bowtie2_presets = ['very-fast-local', 'fast-local', 'sensitive-local', 'very-sensitive-local']
     other_group.add_argument('--bowtie2_preset', choices=bowtie2_presets, help='Preset option for bowtie2 mapping [%(default)s]', default='very-sensitive-local', metavar='|'.join(bowtie2_presets))
     other_group.add_argument('--assembled_threshold', type=float, help='If proportion of gene assembled (regardless of into how many contigs) is at least this value then the flag gene_assembled is set [%(default)s]', default=0.95, metavar='FLOAT (between 0 and 1)')
@@ -44,50 +45,34 @@ def run():
     other_group.add_argument('--clean', type=int, choices=[0,1,2], help='Specify how much cleaning to do. 0=none, 1=some, 2=only keep the report [%(default)s]', default=1, metavar='INT')
     other_group.add_argument('--verbose', action='store_true', help='Be verbose')
 
-    executables_group = parser.add_argument_group('executables locations')
-    executables_group.add_argument('--bcftools', help='bcftools executable [bcftools]', metavar='PATH')
-    executables_group.add_argument('--bowtie2', help='bowtie2 executable [bowtie2]', metavar='PATH')
-    executables_group.add_argument('--cdhit', help=argparse.SUPPRESS)
-    executables_group.add_argument('--gapfiller', help='GapFiller executable [GapFiller.pl]', metavar='PATH')
-    executables_group.add_argument('--nucmer', help=argparse.SUPPRESS, default='nucmer')
-    executables_group.add_argument('--samtools', help='samtools executable [samtools]', metavar='PATH')
-    executables_group.add_argument('--spades', help='SPAdes executable [spades.py]',  metavar='PATH')
-    executables_group.add_argument('--sspace', help='SSPACE executable [SSPACE_Basic_v2.0.pl]', metavar='PATH')
-    executables_group.add_argument('--velvet', help='prefix of velvet{g,h} executables [velvet]', metavar='PATH')
-    executables_group.add_argument('--velvetg', help=argparse.SUPPRESS)
-    executables_group.add_argument('--velveth', help=argparse.SUPPRESS)
-
     options = parser.parse_args()
-    if options.assembler == 'velvet':
-        options.velvet = 'velvet'
-    ariba.external_progs.check_versions(options, verbose=options.verbose, not_required=set(['sspace', 'gapfiller']))
-    pyfastaq.sequences.genetic_code = options.genetic_code
-
 
-    checker = ariba.refcheck.Checker(
-        options.db_fasta,
-        min_length=options.refcheck_min_length,
-        max_length=options.refcheck_max_length,
-    )
-    ok, reason, seq = checker.run()
+    if {None} == {options.presabs, options.varonly, options.noncoding}:
+        print('Error! Must use at least one of the options: --presabs --varonly --noncoding. Cannot continue', file=sys.stderr)
+        sys.exit(1)
-    if not ok:
-        print('\nInput reference file of genes failed refcheck! Cannot continue.', file=sys.stderr)
-        print('The first failed sequence is:\n', file=sys.stderr)
-        print(seq, file=sys.stderr)
-        print('\nIt failed for the reason:', reason, file=sys.stderr)
-        print('\nTo make a new fasta file called new_genes.fa, with the bad genes fixed/removed run:\n', file=sys.stderr)
-        print('    ariba refcheck -o new_genes', options.db_fasta, file=sys.stderr)
-        sys.exit(1)
+    extern_progs = ariba.external_progs.ExternalProgs(verbose=options.verbose)
+    pyfastaq.sequences.genetic_code = options.genetic_code
 
+    if options.verbose:
+        print('{:_^79}'.format(' Loading reference data '), flush=True)
+    refdata = ariba.reference_data.ReferenceData(
+        presence_absence_fa=options.presabs,
+        variants_only_fa=options.varonly,
+        non_coding_fa=options.noncoding,
+        metadata_tsv=options.metadata,
+        min_gene_length=options.min_gene_length,
+        max_gene_length=options.max_gene_length,
+        genetic_code=options.genetic_code,
+    )
 
     c = ariba.clusters.Clusters(
-          options.db_fasta,
+          refdata,
           options.reads_1,
           options.reads_2,
           options.outdir,
+          extern_progs,
           assembly_kmer=options.assembler_k,
-          assembler=options.assembler,
+          assembler='spades',
           threads=options.threads,
           verbose=options.verbose,
           min_scaff_depth=options.min_scaff_depth,
@@ -97,14 +82,7 @@ def run():
           spades_other=options.spades_other,
           assembled_threshold=options.assembled_threshold,
           unique_threshold=options.unique_threshold,
-          bcftools_exe=options.bcftools,
-          gapfiller_exe=options.gapfiller,
-          samtools_exe=options.samtools,
-          bowtie2_exe=options.bowtie2,
           bowtie2_preset=options.bowtie2_preset,
-          spades_exe=options.spades,
-          sspace_exe=options.sspace,
-          velvet_exe=options.velvet,
           cdhit_seq_identity_threshold=options.cdhit_seq_identity_threshold,
           cdhit_length_diff_cutoff=options.cdhit_length_diff_cutoff,
           clean=options.clean,
diff --git a/ariba/tasks/summary.py b/ariba/tasks/summary.py
index 825bf244..85e9f1f4 100644
--- a/ariba/tasks/summary.py
+++ b/ariba/tasks/summary.py
@@ -7,7 +7,7 @@ def run():
         usage = 'ariba summary [options] <outfile> [report1.tsv report2.tsv ...]',
         epilog = 'Files must be listed after the output file and/or the option --fofn must be used. If both used, all files in the filename specified by --fofn AND the files listed after the output file will be used as input. The input report files must be in tsv format, not xls.')
     parser.add_argument('-f', '--fofn', help='File of filenames of ariba reports in tsv format (not xls) to be summarised. Must be used if no input files listed after the outfile.', metavar='FILENAME')
-    parser.add_argument('--js_candy', help='Write files that can be used as input to JS Candy, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into JS Candy', metavar='files_prefix')
+    parser.add_argument('--phandango', help='Write files that can be used as input to phandango, named with the given prefix. Writes a tree and csv file, which can be drag and dropped straight into phandango', metavar='files_prefix')
     parser.add_argument('--min_id', type=float, help='Minimum percent identity cutoff to count as assembled [%(default)s]', default=90, metavar='FLOAT')
     parser.add_argument('--no_filter', action='store_true', help='Do not filter rows or columns of output that are all 0 (by default, they are removed from the output)')
     parser.add_argument('outfile', help='Name of output file. If file ends with ".xls", then an excel spreadsheet is written. Otherwise a tsv file is written')
@@ -22,6 +22,6 @@ def run():
         filenames=options.infiles,
         filter_output=(not options.no_filter),
         min_id=options.min_id,
-        js_candy_prefix=options.js_candy
+        phandango_prefix=options.phandango
     )
     s.run()
diff --git a/ariba/tasks/test.py b/ariba/tasks/test.py
new file mode 100644
index 00000000..a3ce714d
--- /dev/null
+++ b/ariba/tasks/test.py
@@ -0,0 +1,58 @@
+import argparse
+import subprocess
+import shutil
+import os
+import sys
+import ariba
+
+
+def run():
+    parser = argparse.ArgumentParser(
+        description = 'Run ARIBA on a small test dataset',
+        usage = 'ariba test [options] <outdir>')
+    parser.add_argument('--threads', type=int, help='Number of threads [%(default)s]', default=1, metavar='INT')
+    parser.add_argument('outdir', help='Name of output directory')
+    options = parser.parse_args()
+
+    print('Running ARIBA on test data...')
+
+    try:
+        os.mkdir(options.outdir)
+        os.chdir(options.outdir)
+    except OSError:  # only filesystem failures; a bare except would also trap KeyboardInterrupt/SystemExit
+        print('Error making output directory "', options.outdir, '". Cannot continue.', sep='', file=sys.stderr)
+        sys.exit(1)
+
+    print('Made output directory. Copying test data files into it:')
+
+    modules_dir = os.path.dirname(os.path.abspath(ariba.__file__))
+    test_data_dir = os.path.join(modules_dir, 'test_run_data')
+
+    for filename in ['presence_absence.fa', 'non_coding.fa', 'variants_only.fa', 'metadata.tsv', 'reads_1.fq', 'reads_2.fq']:
+        shutil.copy(os.path.join(test_data_dir, filename), filename)
+        print('    copied', filename)
+
+    ariba_command = ' '.join([
+        sys.argv[0],
+        'run',
+        '--verbose',
+        '--presabs presence_absence.fa',
+        '--varonly variants_only.fa',
+        '--noncoding non_coding.fa',
+        '--metadata metadata.tsv',
+        '--threads', str(options.threads),
+        'reads_1.fq',
+        'reads_2.fq',
+        'OUT'
+    ])
+
+    print('\nRunning ARIBA with:', ariba_command, '', sep='\n')
+
+    return_code = subprocess.call(ariba_command, shell=True)
+
+    if return_code != 0:
+        print('\nSomething went wrong. See above for error message(s). Return code was', return_code)
+        sys.exit(1)
+
+    print('-' * 79)
+    print('Finished run on test data OK')
diff --git a/ariba/test_run_data/metadata.tsv b/ariba/test_run_data/metadata.tsv
new file mode 100644
index 00000000..ec4e55c9
--- /dev/null
+++ b/ariba/test_run_data/metadata.tsv
@@ -0,0 +1,14 @@
+presence_absence1	.	.	N	Generic description of presence_absence1
+presence_absence1	p	R3S	N	Ref and assembly have wild type, so do not report
+presence_absence1	p	A10V	N	Ref has wild, reads have variant so report
+presence_absence1	p	I5A	N	Ref and reads have variant so report
+variants_only1	.	.	N	Generic description of variants_only1
+variants_only1	p	I3L	N	Ref and assembly have wild type, so do not report
+variants_only1	p	S5T	N	Ref and reads have variant so report
+variants_only2	p	R3I	N	Ref and reads have wild so do not report
+variants_only2	.	.	N	Generic description of variants_only2
+noncoding1	.	.	N	generic description of noncoding1
+noncoding1	n	A6G	N	variant in ref and reads so should report
+noncoding1	n	G9T	N	wild type in ref and reads so should not report
+noncoding1	n	A14T	N	ref has wild type, reads have variant so should report
+noncoding1	n	A40C	N	ref has variant, reads have wild type so should not report
diff --git a/ariba/test_run_data/non_coding.fa b/ariba/test_run_data/non_coding.fa
new file mode 100644
index 00000000..3278f9e5
--- /dev/null
+++ b/ariba/test_run_data/non_coding.fa
@@ -0,0 +1,6 @@
+>noncoding1
+CGTACGCGGGTGGAGACATGTACTCCACTCCCATACATCCCTAAGTTTGTCCCTAAGGCA
+GTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCAC
+>noncoding2
+TCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCT
+GCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCC
diff --git a/ariba/test_run_data/presence_absence.fa b/ariba/test_run_data/presence_absence.fa
new file mode 100644
index 00000000..64d08662
--- /dev/null
+++ b/ariba/test_run_data/presence_absence.fa
@@ -0,0 +1,5 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCAACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/test_run_data/reads_1.fq b/ariba/test_run_data/reads_1.fq
new file mode 100644
index 00000000..4e7dd75a
--- /dev/null
+++ b/ariba/test_run_data/reads_1.fq
@@ -0,0 +1,904 @@
+@presence_absence1:1:154:213/1
+TAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAGGACATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:2:64:123/1
+CTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:3:21:79/1
+ACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:4:116:174/1
+ATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:5:16:76/1
+CTATAACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTCTTCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:6:76:136/1
+TGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:7:120:180/1
+CCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:8:69:129/1
+TTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:9:31:91/1
+CTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:10:76:135/1
+TGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:11:173:233/1
+AGCGCGTGGGAAAGCATGGAATAAGGACATACCTAGGTGCGAAGTGCAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:12:37:99/1
+ACCGCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:13:131:190/1
+ACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:14:94:153/1
+GCGGCATATGGATCGCGAAGCGATGACCCATGAAGTAACCGAACGCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:15:46:104/1
+AAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:16:115:175/1
+GATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:17:44:103/1
+CTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:18:141:201/1
+CGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:19:149:210/1
+AACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:20:63:124/1
+CCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:21:60:122/1
+CTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:22:111:169/1
+AAGCGATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:23:90:150/1
+TGTTGCGGCATATGGATCGCGAAGCGATGACCCATGAAGTAACCGAACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:24:56:114/1
+ACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:25:21:80/1
+ACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:26:59:118/1
+TCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:27:19:77/1
+TAACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:28:125:186/1
+GAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:29:31:89/1
+CTCTAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:30:64:125/1
+CTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:31:150:211/1
+ACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:32:144:205/1
+GCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:33:59:119/1
+TCTTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:34:40:99/1
+GCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:35:105:165/1
+ATCGCGAAGCGATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:36:114:174/1
+CGATGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:37:82:142/1
+TGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGACCCATGAAGTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:38:75:137/1
+CTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGACCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:39:67:129/1
+TCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:40:34:93/1
+TAAACCGCGCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:41:163:222/1
+TAACGGCATTAGCGCGTGGGAAAGCATGGAATAAGGACATACCTAGGTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:42:42:102/1
+GCCTAAACAGGTACACTTCTTCCTTTCTTGTAGCTGTATTTGGCATAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:43:148:207/1
+CAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:44:117:177/1
+TGACCCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:45:54:114/1
+AGCGGTAGGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:46:8:66/1
+ATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:47:74:134/1
+GCTAGCAGGATTACTTCGTGCTGATCTATGCTGATTAGCACCGAAAACAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:48:6:65/1
+AAATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:49:131:189/1
+AGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGCGAAGATAGCACCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:50:179:238/1
+GCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCATTTCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:51:60:121/1
+AGGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTGATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:52:117:176/1
+AAAACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:53:161:222/1
+GAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:54:99:157/1
+CTATGCTGATTAGCACCGAAAACACCACCGAAAGCAGCGAACGCGCGTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:55:168:227/1
+AAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:56:7:67/1
+AATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:57:9:67/1
+TCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:58:163:221/1
+ACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:59:31:93/1
+GGTTGTTCTCAGTTCTACTGATAAGCGGTAGGGAATCTCAAGTGCTAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:60:57:117/1
+GGTAGGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:61:13:72/1
+AGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:62:19:79/1
+TTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAGGGAATCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:63:153:215/1
+CGCTGACCGAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:64:116:173/1
+GAAAACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:65:18:76/1
+CTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAGGGAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:66:155:215/1
+CTGACCGAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:67:84:143/1
+TTACTTCGTGCTGATCTATGCTGATTAGCACCGAAAACACCACCGAAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:68:62:120/1
+GGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTGATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:69:165:223/1
+GCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:70:1:62/1
+TAAGTAAATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:71:25:86/1
+CCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAGGGAATCTCAAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:72:120:180/1
+ACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:73:164:222/1
+CGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:74:149:210/1
+ACCGCGCTGACCGAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:75:173:233/1
+AGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:76:179:238.dup.2/1
+GCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCATTTCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:77:62:122/1
+GGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTGATTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:78:76:136/1
+TAGCAGGATTACTTCGTGCTGATCTATGCTGATTAGCACCGAAAACACCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:79:12:72/1
+GAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:80:107:167/1
+ATTAGCACCGAAAACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:81:55:113/1
+GCGGTAGGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:82:35:95/1
+GTTCTCAGTTCTACTGATAAGCGGTAGGGAATCTCAAGTGCTAGCAGGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:83:168:228/1
+AAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:84:180:240/1
+CGACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCATTTCTCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:85:97:157/1
+ATCTATGCTGATTAGCACCGAAAACACCACCGAAAGCAGCGAACGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:86:119:177/1
+AACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:87:157:216/1
+GACCGAACGCGAAGATAGCACCGCGACCGAATAAAGCCCGCGTTACCAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:88:40:101/1
+CATGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:89:42:101/1
+TGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:90:14:75/1
+GTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:91:47:106/1
+TTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:92:9:67/1
+GCCCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:93:56:116/1
+GGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTTCTACATGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:94:18:78/1
+AGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:95:47:106.dup.2/1
+TTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:96:38:97/1
+TCCATGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:97:144:203/1
+CGACCACCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:98:31:90/1
+GAACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:99:147:206/1
+CCACCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:100:11:71/1
+CCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:101:8:69/1
+GGCCCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:102:45:106/1
+TATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:103:83:142/1
+CAAGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAAAATTACCTGCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:104:124:183/1
+TACCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:105:32:91/1
+AACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCCGTGGTCACCTCCCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:106:12:72/1
+CGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:107:156:214/1
+CGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:108:85:145/1
+AGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAAAATTACCTGCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:109:84:145/1
+AAGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAAAATTACCTGCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:110:60:117/1
+CTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTTCTACATGTTTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:111:131:191/1
+CATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACGATCATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:112:23:84/1
+TGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCCGTGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:113:125:186/1
+ACCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:114:160:220/1
+CGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCATATTAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:115:148:207/1
+CACCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:116:126:186/1
+CCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:117:155:214/1
+GCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:118:79:138/1
+CCGCCAAGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAAAATTACCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:119:158:216/1
+AGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCATATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:120:139:199/1
+CATGGCGACCACCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:121:55:114/1
+CGGCCCTTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTTCTACATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:122:153:213/1
+TTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:123:20:79/1
+AGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTCGCCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:124:154:215/1
+TGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAAGCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:125:178:235/1
+TTATGAAGCGCATTAAAATCAAGCATATTAGTTTATTAAATGGTACCAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:126:150:210/1
+CCATTGCGAGCGAAAAACTGAACGATCATTATGAAGCGCATTAAAATCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:127:16:76/1
+TAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:128:6:65/1
+CAGGCCCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:129:15:75/1
+TTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCCTTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:130:116:175/1
+AAAAAAATTACCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:131:204:263/1
+ACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGGATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:132:64:123/1
+ACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:133:104:164/1
+ACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:134:131:188/1
+CCATACATCACTAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:135:65:126/1
+CTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:136:152:213/1
+CCTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:137:73:133/1
+ATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:138:35:94/1
+TAGGGAGAATCCTCTCAGTGTTACCTTACACTTTTCGAATCAGGGTTATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:139:142:202/1
+TAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:140:196:256/1
+CTTAGGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:141:198:255/1
+TAGGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:142:27:86/1
+GGGTACTTTAGGGAGAATCCTCTCAGTGTTACCTTACACTTTTCGAATCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:143:153:212/1
+CTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:144:1:59/1
+ATGTGGTGTGTCCGTCTGACTTAGCTGGGTACTTTAGGGAGAATCCTCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:145:89:149/1
+GTCGCCCAACCTCGTACGCGGGTGGTGACATGTACTCCACTCCCATACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:146:178:237/1
+ACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:147:67:126/1
+TTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:148:17:77/1
+TGACTTAGCTGGGTACTTTAGGGAGAATCCTCTCAGTGTTACCTTACACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:149:64:124/1
+ACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:150:116:174/1
+ACATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCAGTGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:151:147:205/1
+TTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:152:95:155/1
+CAACCTCGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:153:176:235/1
+GAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:154:202:262/1
+GAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:155:176:236/1
+GAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:156:94:154/1
+CCAACCTCGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:157:71:128/1
+GAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:158:200:259/1
+GGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:159:102:162/1
+GTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:160:142:199/1
+TAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACGAACTGCGGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:161:195:254/1
+GCTTAGGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGACCGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:162:73:133.dup.2/1
+ATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:163:117:175/1
+CATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCAGTGCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:164:49:109/1
+TCAGTGTTACCTTACACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:165:80:138/1
+TTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTACTCCACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:166:177:235/1
+AACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:167:71:131/1
+GAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:168:56:116/1
+TACCTTACACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:169:171:230/1
+CCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:170:134:194/1
+TACATCACTAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACGAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:171:26:86/1
+TGGGTACTTTAGGGAGAATCCTCTCAGTGTTACCTTACACTTTTCGAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:172:182:241/1
+ACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACATAGCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:173:80:141/1
+TTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTACTCCACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:174:208:267/1
+CATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGGATTCGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:175:30:91/1
+TACTTTAGGGAGAATCCTCTCAGTGTTACCTTACACTTTTCGAATCAGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:176:206:266/1
+GCCATCCGAGATCACCCACATAGCAAAGAACAGACCGACTTCGGATTCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:177:81:139/1
+TATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATGTACTCCACTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:178:10:67/1
+GTCCGTCTGACTTAGCTGGGTACTTTAGGGAGAATCCTCTCAGTGTTACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:179:199:258/1
+GGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTCTGTATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:180:35:94/1
+GGACATCGAATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:181:186:244/1
+ACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:182:43:100/1
+AATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:183:75:135/1
+TGTTCCGGAGCGTTGACTCTCATAGATCTTTAACTGTTCACGACTGTATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:184:57:117/1
+ATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:185:122:181/1
+ATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCTGCGATACAAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:186:186:245/1
+ACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:187:99:158/1
+GATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:188:57:115/1
+ATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:189:136:196/1
+ATCTTAAGTTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:190:23:81/1
+CACGGCACATTTGGACATCGAATACACTAGCGGTATTTATGCTTATCTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:191:97:156/1
+TAGATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:192:58:118/1
+TTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:193:156:214/1
+GCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:194:157:215/1
+CGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:195:11:71/1
+AGGTACCGGGCCCACGGCACATTTGGACATCGAATACACTAGCGGTATTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:196:164:226/1
+ATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:197:60:119/1
+TATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:198:152:212/1
+AAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:199:200:260/1
+GTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTCTGTATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:200:167:225/1
+CAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:201:24:86/1
+ACGGCACATTTGGACATCGAATACACTAGCGGTATTTATGCTTATCTGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:202:144:203/1
+TTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:203:41:101/1
+CGAATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:204:189:249/1
+GTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:205:29:89/1
+ACATTTGGACATCGAATACACTAGCGGTATTTATGCTTATCTGCCCTGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:206:194:253/1
+CGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:207:140:199/1
+TAAGTTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:208:168:227/1
+AAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:209:89:148/1
+GACTCTCATAGATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:210:193:252/1
+ACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:211:166:224/1
+ACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:212:150:209/1
+CCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:213:197:256/1
+CGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTCTCTGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:214:36:97/1
+GACATCGAATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:215:47:107/1
+CACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:216:165:224/1
+TACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:217:150:208/1
+CCAAGCGCGCTGCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:218:26:87/1
+GGCACATTTGGACATCGAATACACTAGCGGTATTTATGCTTATCTGCCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:219:43:102/1
+AATACACTAGCGGTATTTATGCTTATCTGCCCTGTTCCGGAGCGTTGACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:220:161:221/1
+GCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:221:129:189/1
+CTTGCAAATCTTAAGTTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:222:185:244/1
+GACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:223:8:68/1
+ACAAGGTACCGGGCCCACGGCACATTTGGACATCGAATACACTAGCGGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:224:183:242/1
+CGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:225:100:159/1
+ATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:226:100:159.dup.2/1
+ATCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/test_run_data/reads_2.fq b/ariba/test_run_data/reads_2.fq
new file mode 100644
index 00000000..6299acb2
--- /dev/null
+++ b/ariba/test_run_data/reads_2.fq
@@ -0,0 +1,904 @@
+@presence_absence1:1:154:213/2
+TAATCTTAAGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:2:64:123/2
+AATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:3:21:79/2
+CTTCATGGGTCATCGCTTCGCGATCCATATGCCGCAACACTATGCCAAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:4:116:174/2
+ATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:5:16:76/2
+CATGGGTCATCGCTTCGCGATCCATATGCCGCAACACTATGCCAAATACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:6:76:136/2
+TTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:7:120:180/2
+GAAACCATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:8:69:129/2
+CGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:9:31:91/2
+CGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATATGCCGCAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:10:76:135/2
+TTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:11:173:233/2
+AGTGAAGCTTTCAGAGGATTTAATCTTAAGTCTGGGTTTTCTATATCCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:12:37:99/2
+GGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:13:131:190/2
+TATCCACACCGAAACCATCTGCACTTCGCACCTAGGTATGTCCTTATTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:14:94:153/2
+ATGTCCTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:15:46:104/2
+ATGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:16:115:175/2
+CATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:17:44:103/2
+TGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:18:141:201/2
+TGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGCACCTAGGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:19:149:210/2
+TCTTAAGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:20:63:124/2
+TAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:21:60:122/2
+ATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:22:111:169/2
+CACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCGCTAATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:23:90:150/2
+TCCTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:24:56:114/2
+AATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:25:21:80/2
+ACTTCATGGGTCATCGCTTCGCGATCCATATGCCGCAACACTATGCCAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:26:59:118/2
+CGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:27:19:77/2
+TCATGGGTCATCGCTTCGCGATCCATATGCCGCAACACTATGCCAAATAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:28:125:186/2
+CACACCGAAACCATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:29:31:89/2
+CGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATATGCCGCAACAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:30:64:125/2
+CTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:31:150:211/2
+ATCTTAAGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:32:144:205/2
+AGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGCACCTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:33:59:119/2
+CCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTACTTCATGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:34:40:99/2
+GGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:35:105:165/2
+TCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCGCTAATGCCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:36:114:174/2
+ATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACGCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:37:82:142/2
+CCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:38:75:137/2
+CTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:39:67:129/2
+CGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:40:34:93/2
+CGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCATATGCCGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:41:163:222/2
+CAGAGGATTTAATCTTAAGTCTGGGTTTTCTATATCCACACCGAAACCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:42:42:102/2
+GTTGGTGCTCGCGCGTTCGGTTACTTCATGGGTCATCGCTTCGCGATCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:43:148:207/2
+TAAGTCTGGGTTTTCTATATCCACACCGAAACCATCTGCACTTCGCACCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:44:117:177/2
+ACCATCTGCACTTCGCACCTAGGTATGTCCTTATTCCATGCTTTCCCACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:45:54:114/2
+TTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTGTTTTCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:46:8:66/2
+GGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:47:74:134/2
+GTCGCGGTGCTATCTTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:48:6:65/2
+GTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:49:131:189/2
+AAGTAGCAGCTGAGAAATGACGGTGCGGAACATCTGGTAACGCGGGCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:50:179:238/2
+TGGTTCCAAGCCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:51:60:121/2
+CTTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:52:117:176/2
+GAAATGACGGTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCGCGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:53:161:222/2
+AATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:54:99:157/2
+TCTGGTAACGCGGGCTTTATTCGGTCGCGGTGCTATCTTCGCGTTCGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:55:168:227/2
+CATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:56:7:67/2
+CGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:57:9:67/2
+CGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:58:163:221/2
+ATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:59:31:93/2
+GCGTTCGCTGCTTTCGGTGGTGTTTTCGGTGCTAATCAGCATAGATCAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:60:57:117/2
+GCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTGTTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:61:13:72/2
+GTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:62:19:79/2
+CGGTGGTGTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:63:153:215/2
+GATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAATGACGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:64:116:173/2
+ATGACGGTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCGCGGTGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:65:18:76/2
+TGGTGTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:66:155:215/2
+GATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAATGACGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:67:84:143/2
+CTTTATTCGGTCGCGGTGCTATCTTCGCGTTCGGTCAGCGCGGTGCACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:68:62:120/2
+TTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:69:165:223/2
+GAATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:70:1:62/2
+CTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCACTTGAGATTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:71:25:86/2
+CTGCTTTCGGTGGTGTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:72:120:180/2
+CTGAGAAATGACGGTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:73:164:222/2
+AATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:74:149:210/2
+TGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAATGACGGTGCGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:75:173:233/2
+CCAAGCCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:76:179:238.dup.2/2
+TGGTTCCAAGCCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:77:62:122/2
+TCTTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:78:76:136/2
+CGGTCGCGGTGCTATCTTCGCGTTCGGTCAGCGCGGTGCACGCGCGTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:79:12:72/2
+GTTTTCGGTGCTAATCAGCATAGATCAGCACGAAGTAATCCTGCTAGCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:80:107:167/2
+GTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCGCGGTGCTATCTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:81:55:113/2
+TCGGTCAGCGCGGTGCACGCGCGTTCGCTGCTTTCGGTGGTGTTTTCGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:82:35:95/2
+GCGCGTTCGCTGCTTTCGGTGGTGTTTTCGGTGCTAATCAGCATAGATCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:83:168:228/2
+CCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:84:180:240/2
+CTTGGTTCCAAGCCATCGAATGGCTGATACTGGACGCTTCAGCTAAAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:85:97:157/2
+TCTGGTAACGCGGGCTTTATTCGGTCGCGGTGCTATCTTCGCGTTCGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:86:119:177/2
+AGAAATGACGGTGCGGAACATCTGGTAACGCGGGCTTTATTCGGTCGCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:87:157:216/2
+TGATACTGGACGCTTCAGCTAAAGTGAAAGTAGCAGCTGAGAAATGACGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:88:40:101/2
+GTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:89:42:101/2
+GTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:90:14:75/2
+AATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:91:47:106/2
+CAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:92:9:67/2
+TCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGACCACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:93:56:116/2
+TTTTCGCTCGCAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:94:18:78/2
+GGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:95:47:106.dup.2/2
+CAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:96:38:97/2
+TCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACATGTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:97:144:203/2
+GTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAAACTAATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:98:31:90/2
+GTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:99:147:206/2
+GCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAAACTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:100:11:71/2
+TTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:101:8:69/2
+TTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGACCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:102:45:106/2
+CAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:103:83:142/2
+AATGCGCTTCATAATGATCGTTCAGTTTTTCGCTCGCAATGGTGGTCGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:104:124:183/2
+GGGGAGTTGGTACCATTTAATAAACTAATATGCTTGATTTTAATGCGCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:105:32:91/2
+TGTTTTCATGGCAGGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:106:12:72/2
+TTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:107:156:214/2
+TATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:108:85:145/2
+TTTAATGCGCTTCATAATGATCGTTCAGTTTTTCGCTCGCAATGGTGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:109:84:145/2
+TTTAATGCGCTTCATAATGATCGTTCAGTTTTTCGCTCGCAATGGTGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:110:60:117/2
+TTTTTCGCTCGCAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:111:131:191/2
+TAATGGGGGGGGAGTTGGTACCATTTAATAAACTAATATGCTTGATTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:112:23:84/2
+ATGGCAGGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:113:125:186/2
+GGGGGGGAGTTGGTACCATTTAATAAACTAATATGCTTGATTTTAATGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:114:160:220/2
+CTTTTGTATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:115:148:207/2
+GGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:116:126:186/2
+GGGGGGGAGTTGGTACCATTTAATAAACTAATATGCTTGATTTTAATGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:117:155:214/2
+TATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:118:79:138/2
+CGCTTCATAATGATCGTTCAGTTTTTCGCTCGCAATGGTGGTCGCCATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:119:158:216/2
+TGTATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:120:139:199/2
+GTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAAACTAATATGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:121:55:114/2
+TTCGCTCGCAATGGTGGTCGCCATGTTTTCATGGCAGGTAATTTTTTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:122:153:213/2
+ATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:123:20:79/2
+AGGTAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:124:154:215/2
+GTATCAAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:125:178:235/2
+AACTGGTAAAAAGCCCTTTTGTATCAAGGGCAGTGGGTACATTTTAATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:126:150:210/2
+AAGGGCAGTGGGTACATTTTAATGGGGGGGGAGTTGGTACCATTTAATAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:127:16:76/2
+TAATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:128:6:65/2
+GCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGGTGACCACGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:129:15:75/2
+AATTTTTTTCGCTTCGCGAAACATGTAGAACGTCAACACTTGGCGGGAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only2:130:116:175/2
+GGTACCATTTAATAAACTAATATGCTTGATTTTAATGCGCTTCATAATGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:131:204:263/2
+CAAACATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:132:64:123/2
+GGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:133:104:164/2
+CGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:134:131:188/2
+TGTTCTTTGCTATGTGGGTGATCTCGGATGGCGTTCCCTAAGCATCTCAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:135:65:126/2
+GTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:136:152:213/2
+GATTAATTCGAATCCGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:137:73:133/2
+TTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:138:35:94/2
+TAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCGTACGAGGTTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:139:142:202/2
+ATCCGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTCGGATGGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:140:196:256/2
+TTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAGATTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:141:198:255/2
+TCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAGATTAATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:142:27:86/2
+TATGGGAGTGGAGTACATGTCACCACCCGCGTACGAGGTTGGGCGACTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:143:153:212/2
+ATTAATTCGAATCCGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:144:1:59/2
+CGCGTACGAGGTTGGGCGACTGAAGATAACCCTGATTCGAAAAGTGTAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:145:89:149/2
+AAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:146:178:237/2
+CAGACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:147:67:126/2
+GTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:148:17:77/2
+GGAGTACATGTCACCACCCGCGTACGAGGTTGGGCGACTGAAGATAACCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:149:64:124/2
+GGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:150:116:174/2
+TGGGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:151:147:205/2
+CGAATCCGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTCGGATGGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:152:95:155/2
+TTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:153:176:235/2
+GACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:154:202:262/2
+AAACATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:155:176:236/2
+AGACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:156:94:154/2
+TCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:157:71:128/2
+TCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:158:200:259/2
+CATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAGATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:159:102:162/2
+GATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:160:142:199/2
+CGAAGTCGGTCTGTTCTTTGCTATGTGGGTGATCTCGGATGGCGTTCCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:161:195:254/2
+CCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATGCGAGATTAATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:162:73:133.dup.2/2
+TTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:163:117:175/2
+GTGGGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:164:49:109/2
+CCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:165:80:138/2
+CGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:166:177:235/2
+GACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:167:71:131/2
+CGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTGATGTATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:168:56:116/2
+GGCACTGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:169:171:230/2
+TCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCGGTCTGTTCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:170:134:194/2
+TCGGTCTGTTCTTTGCTATGTGGGTGATCTCGGATGGCGTTCCCTAAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:171:26:86/2
+TATGGGAGTGGAGTACATGTCACCACCCGCGTACGAGGTTGGGCGACTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:172:182:241/2
+GATCCAGACTCTCGCCTGCCTCATGCGAGATTAATTCGAATCCGAAGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:173:80:141/2
+CACCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:174:208:267/2
+CTAGCAAACATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:175:30:91/2
+TGATGTATGGGAGTGGAGTACATGTCACCACCCGCGTACGAGGTTGGGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:176:206:266/2
+TAGCAAACATTTCCCGTCGAATACAGATCCAGACTCTCGCCTGCCTCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:177:81:139/2
+CCGCAGTTCGTTCGTGGGCGGCGGGCACTGCCTTAGGGACAAACTTAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:178:10:67/2
+TCACCACCCGCGTACGAGGTTGGGCGACTGAAGATAACCCTGATTCGAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:179:199:258/2
+ATAGCGCCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:180:35:94/2
+CTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:181:186:244/2
+AGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTCCATAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:182:43:100/2
+GAAGAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:183:75:135/2
+CGCTAAACTTGGGATTTGTATCGCAGCGCGCTTGGGAAGAACTTAAGATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:184:57:117/2
+TATCGCAGCGCGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:185:122:181/2
+CAAAACGAACGGACGCATACATTCTTAGAACCCGGCGTGAACTGTCCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:186:186:245/2
+GAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTCCATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:187:99:158/2
+CTTAGAACCCGGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:188:57:115/2
+TCGCAGCGCGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACAGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:189:136:196/2
+ACAGAGACAATTGCACAAAACGAACGGACGCATACATTCTTAGAACCCGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:190:23:81/2
+AGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGAGAGTCAACGCTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:191:97:156/2
+TAGAACCCGGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAGCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:192:58:118/2
+GTATCGCAGCGCGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:193:156:214/2
+AGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:194:157:215/2
+CAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:195:11:71/2
+CAGTCGTGAACAGTTAAAGATCTATGAGAGTCAACGCTCCGGAACAGGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:196:164:226/2
+TTCTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:197:60:119/2
+TGTATCGCAGCGCGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:198:152:212/2
+TAGACAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGCATA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:199:200:260/2
+CGATAGCGCCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:200:167:225/2
+TCTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:201:24:86/2
+TTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGAGAGTCAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:202:144:203/2
+GGTCCATACAGAGACAATTGCACAAAACGAACGGACGCATACATTCTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:203:41:101/2
+GGAAGAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:204:189:249/2
+TGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:205:29:89/2
+GATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGAGAGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:206:194:253/2
+GCCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:207:140:199/2
+CATACAGAGACAATTGCACAAAACGAACGGACGCATACATTCTTAGAACC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:208:168:227/2
+TTTCTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:209:89:148/2
+GGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAGCGCGCTTGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:210:193:252/2
+CCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:211:166:224/2
+CTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:212:150:209/2
+ACAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGCATACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:213:197:256/2
+AGCGCCATGAAGAGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:214:36:97/2
+GAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:215:47:107/2
+CGCTTGGGAAGAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:216:165:224/2
+CTACCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:217:150:208/2
+CAATTGGTCCATACAGAGACAATTGCACAAAACGAACGGACGCATACATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:218:26:87/2
+TTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAGATCTATGAGAGTCAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:219:43:102/2
+GGGAAGAACTTAAGATTTGCAAGCCGCGATACAGTCGTGAACAGTTAAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:220:161:221/2
+CCGGTCCAGTAGACAATTGGTCCATACAGAGACAATTGCACAAAACGAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:221:129:189/2
+CAATTGCACAAAACGAACGGACGCATACATTCTTAGAACCCGGCGTGAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:222:185:244/2
+AGTAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTCCATAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:223:8:68/2
+TCGTGAACAGTTAAAGATCTATGAGAGTCAACGCTCCGGAACAGGGCAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:224:183:242/2
+TAGAGAGCGTTTATGTTTCTACCGGTCCAGTAGACAATTGGTCCATACAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:225:100:159/2
+TCTTAGAACCCGGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding2:226:100:159.dup.2/2
+TCTTAGAACCCGGCGTGAACTGTCCGCTAAACTTGGGATTTGTATCGCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/test_run_data/ref_fasta_to_make_reads_from.fa b/ariba/test_run_data/ref_fasta_to_make_reads_from.fa
new file mode 100644
index 00000000..aee16d5f
--- /dev/null
+++ b/ariba/test_run_data/ref_fasta_to_make_reads_from.fa
@@ -0,0 +1,32 @@
+>presence_absence1
+TTAAGCTGCCTAACCCTATAACGTTCAGCACTCTAAACCGCGCCTAAACAGGTACACTTC
+TTCCTTTCTTGTAGCTGTATTTGGCATAGTGTTGCGGCATATGGATCGCGAAGCGATGAC
+CCATGAAGTAACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTG
+GGAAAGCATGGAATAAGGACATACCTAGGTGCGAAGTGCAGATGGTTTCGGTGTGGATAT
+AGAAAACCCAGACTTAAGATTAAATCCTCTGAAAGCTTCACTGACGTCATGACTCA
+>variants_only1
+TAAGTAAATCAGAGTAGCTTCGCGCCTCAGGGTTGTTCTCAGTTCTACTGATAAGCGGTA
+GGGAATCTCAAGTGCTAGCAGGATTACTTCGTGCTGATCTATGCTGATTAGCACCGAAAA
+CACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACCGAACGCGAAGATAGCACCGC
+GACCGAATAAAGCCCGCGTTACCAGATGTTCCGCACCGTCATTTCTCAGCTGCTACTTTC
+ACTTTAGCTGAAGCGTCCAGTATCAGCCATTCGATGGCTTGGAACCAAGT
+>variants_only2
+ATGCGCAGGCCCGGTTAAGAGGTGCTACGGGAACTCCTCCATGCTATTAGAGAACGGCCC
+TTTCGCCGTGGTCACCTCCCGCCAAGTGTTGACGTTCTACATGTTTCGCGAAGCGAAAAA
+AATTACCTGCCATGAAAACATGGCGACCACCATTGCGAGCGAAAAACTGAACGATCATTA
+TGAAGCGCATTAAAATCAAGCATATTAGTTTATTAAATGGTACCAACTCCCCCCCCATTA
+AAATGTACCCACTGCCCTTGATACAAAAGGGCTTTTTACCAGTTCTACGACCG
+>noncoding1
+ATGTGGTGTGTCCGTCTGACTTAGCTGGGTACTTTAGGGAGAATCCTCTCAGTGTTACCT
+TACACTTTTCGAATCAGGGTTATCTTCAGTCGCCCAACCTCGTACGCGGGTGGTGACATG
+TACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCAGTGCCCGCCGCCCACGAACG
+AACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCACCCACATAGCAAAGAACAGAC
+CGACTTCGGATTCGAATTAATCTCGCATGAGGCAGGCGAGAGTCTGGATCTGTATTCGAC
+GGGAAATGTTTGCTAGGTCT
+>noncoding2
+CGACTACACAAGGTACCGGGCCCACGGCACATTTGGACATCGAATACACTAGCGGTATTT
+ATGCTTATCTGCCCTGTTCCGGAGCGTTGACTCTCATAGATCTTTAACTGTTCACGACTG
+TATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCTGCGATACAAATCCCAAGTTT
+AGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCCGTTCGTTTTGTGCAATTGTC
+TCTGTATGGACCAATTGTCTACTGGACCGGTAGAAACATAAACGCTCTCTACTCTTCATG
+GCGCTATCGGGGAGGGGGCG
diff --git a/ariba/test_run_data/variants_only.fa b/ariba/test_run_data/variants_only.fa
new file mode 100644
index 00000000..2a3b01c5
--- /dev/null
+++ b/ariba/test_run_data/variants_only.fa
@@ -0,0 +1,6 @@
+>variants_only1
+ATGCTGATTAGCACCGAAAACACCACCGAAAGCAGCGAACGCGCGTGCACCGCGCTGACC
+GAACGCGAAGATAGCACCGCGACCGAATAA
+>variants_only2
+ATGTTTCGCGAAGCGAAAAAAATTACCTGCCATGAAAACATGGCGACCACCATTGCGAGC
+GAAAAACTGAACGATCATTATGAAGCGCATTAA
diff --git a/ariba/tests/assembly_compare_test.py b/ariba/tests/assembly_compare_test.py
new file mode 100644
index 00000000..7f88111b
--- /dev/null
+++ b/ariba/tests/assembly_compare_test.py
@@ -0,0 +1,260 @@
+import unittest
+import os
+import copy
+import shutil
+import filecmp
+import pyfastaq
+import pysam
+import pymummer
+from ariba import assembly_compare
+
+modules_dir = os.path.dirname(os.path.abspath(assembly_compare.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestAssemblyCompare(unittest.TestCase):
+    def test_parse_nucmer_coords_file(self):
+        '''_parse_nucmer_coords_file should return the file's alignments keyed by contig name'''
+        coords_path = os.path.join(data_dir, 'assembly_compare_parse_nucmer_coords_file.coords')
+        # Expected alignments as tab-separated coords fields; the last field is the contig name.
+        rows = [
+            ['1', '1000', '1', '1000', '1000', '1000', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['1', '240', '1', '240', '240', '240', '100.00', '1000', '580', '1', '1', 'ref', 'contig2'],
+            ['661', '1000', '241', '580', '340', '340', '100.00', '1000', '580', '1', '1', 'ref', 'contig2'],
+        ]
+        alignments = [pymummer.alignment.Alignment('\t'.join(fields)) for fields in rows]
+        expected = {'contig1': alignments[:1], 'contig2': alignments[1:]}
+        got = assembly_compare.AssemblyCompare._parse_nucmer_coords_file(coords_path, 'ref')
+        self.assertEqual(expected, got)
+
+
+    def test_nucmer_hits_to_percent_identity(self):
+        '''_nucmer_hits_to_percent_identity should give one percent identity per scaffold'''
+        # scaff1 has two hits (90.00 and 100.00 percent identity); scaff2 has one (42.42).
+        rows = [
+            ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['1', '42', '1', '42', '42', '42', '42.42', '1000', '1000', '1', '1', 'ref', 'scaff2'],
+        ]
+        alignments = [pymummer.alignment.Alignment('\t'.join(fields)) for fields in rows]
+        nucmer_hits = {
+            'scaff1': alignments[:2],
+            'scaff2': alignments[2:],
+        }
+
+        # The scaff1 expectation weights each hit's identity by its length (10 and 34).
+        scaff1_identity = round((90*10 + 100*34) / (10+34), 2)
+        expected = {'scaff1': scaff1_identity, 'scaff2': 42.42}
+        got = assembly_compare.AssemblyCompare._nucmer_hits_to_percent_identity(nucmer_hits)
+        self.assertEqual(expected, got)
+
+
+    def test_nucmer_hits_to_assembly_coords(self):
+        '''_nucmer_hits_to_assembly_coords should return merged scaffold intervals for each scaffold'''
+        # Scaffold (query) coordinates in the rows are 1-based: 1-10, 9-42 and 50-52 on
+        # scaff1, and 1-42 on scaff2.
+        rows = [
+            ['1', '10', '1', '10', '10', '10', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['50', '52', '50', '52', '3', '3', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff1'],
+            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'scaff2'],
+        ]
+        alignments = [pymummer.alignment.Alignment('\t'.join(fields)) for fields in rows]
+        nucmer_hits = {
+            'scaff1': alignments[:3],
+            'scaff2': alignments[3:],
+        }
+
+        # Expected intervals are 0-based, and the overlapping 1-10 and 9-42 hits
+        # collapse into the single interval (0, 41).
+        expected = {
+            'scaff1': [
+                pyfastaq.intervals.Interval(0, 41),
+                pyfastaq.intervals.Interval(49, 51)
+            ],
+            'scaff2': [
+                pyfastaq.intervals.Interval(0, 41),
+            ]
+        }
+        got = assembly_compare.AssemblyCompare._nucmer_hits_to_assembly_coords(nucmer_hits)
+        self.assertEqual(expected, got)
+
+
+    def test_nucmer_hits_to_ref_coords(self):
+        '''nucmer_hits_to_ref_coords should return merged reference intervals per contig'''
+        # Reference coordinates in the rows are 1-based: 1-42, 31-52 and 100-142 on
+        # contig1, and 100-110 on contig2.
+        rows = [
+            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['31', '52', '1', '22', '22', '22', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'ref', 'contig2'],
+        ]
+        alignments = [pymummer.alignment.Alignment('\t'.join(fields)) for fields in rows]
+        nucmer_hits = {
+            'contig1': alignments[:3],
+            'contig2': alignments[3:],
+        }
+
+        # Expected intervals are 0-based; the overlapping 1-42 and 31-52 hits on
+        # contig1 merge into (0, 51).
+        contig1_intervals = [pyfastaq.intervals.Interval(0,51), pyfastaq.intervals.Interval(99, 141)]
+        contig2_intervals = [pyfastaq.intervals.Interval(99, 109)]
+
+        # Without a contig argument, every contig in nucmer_hits is reported.
+        got = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits)
+        self.assertEqual({'contig1': contig1_intervals, 'contig2': contig2_intervals}, got)
+
+        # With contig='contig2', only that contig should be reported.
+        got = assembly_compare.AssemblyCompare.nucmer_hits_to_ref_coords(nucmer_hits, contig='contig2')
+        self.assertEqual({'contig2': contig2_intervals}, got)
+
+
+
+    def test_ref_cov_per_contig(self):
+        '''ref_cov_per_contig should return the number of reference bases covered by each contig'''
+        # Reference coordinates in the rows: 1-42 and 100-142 for contig1,
+        # and 100-110 for contig2.
+        rows = [
+            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'ref', 'contig1'],
+            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'ref', 'contig2'],
+        ]
+        alignments = [pymummer.alignment.Alignment('\t'.join(fields)) for fields in rows]
+        nucmer_hits = {
+            'contig1': alignments[:2],
+            'contig2': alignments[2:],
+        }
+
+        # contig1 covers 42 + 43 = 85 reference positions over its two hits;
+        # contig2 covers 11.
+        expected = {'contig1': 85, 'contig2': 11}
+        got = assembly_compare.AssemblyCompare.ref_cov_per_contig(nucmer_hits)
+        self.assertEqual(expected, got)
+
+
+    def test_write_assembled_reference_sequences(self):
+        '''_write_assembled_reference_sequences should write the expected FASTA for a forward and a reversed hit'''
+        ref_sequence = pyfastaq.sequences.Fasta('ref_seq', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG')
+        contigs = {
+            'contig1': pyfastaq.sequences.Fasta('contig1', 'CATCTATGCTGCATCGATCACTGACGTATCATCATCAGCGTACTGACGTATTAGTTTGTAATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAGACGTCGTACTACTGTATATGCATCGATCTGAA'),
+            'contig2': pyfastaq.sequences.Fasta('contig2', 'AGTGATATCCTGCGATCTATAATTTTTTTCGCGGGATCTTGAACGCGACGATGTTCGATAATTCAATGCAAAGGAGCGACCCGCAAGTACACAGGACTGCAAA')
+        }
+
+        # contig1 matches reference 1-147 with ascending contig coordinates (61-207);
+        # contig2 matches reference 18-120 with contig coordinates running 103 down to 1.
+        rows = [
+            ['1', '147', '61', '207', '147', '147', '100.00', '147', '239', '1', '1', 'ref_seq', 'contig1'],
+            ['18', '120', '103', '1', '103', '103', '100.00', '147', '103', '1', '-1', 'ref_seq', 'contig2']
+        ]
+        alignments = [pymummer.alignment.Alignment('\t'.join(fields)) for fields in rows]
+        nucmer_hits = {
+            'contig1': alignments[:1],
+            'contig2': alignments[1:],
+        }
+
+        expected_path = os.path.join(data_dir, 'assembly_compare_write_assembled_reference_sequences.expected.fa')
+        written_path = 'tmp.test_nucmer_hits_to_assembled_gene_sequences.out.fa'
+        assembly_compare.AssemblyCompare._write_assembled_reference_sequences(nucmer_hits, ref_sequence, contigs, written_path)
+        files_match = filecmp.cmp(written_path, expected_path, shallow=False)
+        self.assertTrue(files_match)
+        os.unlink(written_path)
+
+
+    def test_whole_gene_covered_by_nucmer_hits(self):
+        '''_whole_gene_covered_by_nucmer_hits should be true only when the hits span the 10-base reference'''
+        ref_seq = pyfastaq.sequences.Fasta('ref', 'ACGTGTGCAT')
+        whole = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'ref', 'contig1']
+        first_half = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'ref', 'contig2']
+        second_half = ['6', '10', '6', '10', '5', '5', '100.00', '10', '10', '1', '1', 'ref', 'contig2']
+        cases = [
+            ({'contig1': [whole]}, True),
+            ({'contig2': [first_half]}, False),
+            ({'contig2': [first_half, second_half]}, True),
+        ]
+        for rows_by_contig, expected in cases:
+            nucmer_hits = {name: [pymummer.alignment.Alignment('\t'.join(row)) for row in rows] for name, rows in rows_by_contig.items()}
+            got = assembly_compare.AssemblyCompare._whole_gene_covered_by_nucmer_hits(nucmer_hits, ref_seq, 0.95)
+            self.assertEqual(expected, got)
+
+
+    def test_ref_has_region_assembled_twice(self):
+        '''_ref_has_region_assembled_twice should become true once a second contig hits the same reference region'''
+        ref_seq = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT')
+        full_hit = pymummer.alignment.Alignment('\t'.join(['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1']))
+        half_hit = pymummer.alignment.Alignment('\t'.join(['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']))
+        assembled_twice = assembly_compare.AssemblyCompare._ref_has_region_assembled_twice
+        nucmer_hits = {'contig1': [full_hit]}
+        self.assertFalse(assembled_twice(nucmer_hits, ref_seq, 0.03))
+        self.assertTrue(assembled_twice(dict(nucmer_hits, contig2=[half_hit]), ref_seq, 0.03))
+
+
+    def test_ref_covered_by_complete_contig_with_orf(self):
+        '''_ref_covered_by_complete_contig_with_orf should be true only for the full-length hit on the intact gene'''
+        gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
+        gene_no_orf = pyfastaq.sequences.Fasta('gene', 'GATTGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
+        hit1 = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
+        hit2 = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
+        hit3 = ['21', '39', '21', '39', '19', '19', '100.00', '39', '39', '1', '1', 'gene', 'contig2']
+
+        # Each case is (coords rows per contig, assembled contigs, expected result).
+        # hit1 spans the whole 39-base reference; hit2 and hit3 each cover only part of it.
+        cases = [
+            ({'contig1': [hit1]}, {'contig1': gene}, True),
+            ({'contig1': [hit1]}, {'contig1': gene_no_orf}, False),
+            ({'contig2': [hit2]}, {'contig1': gene}, False),
+            ({'contig2': [hit2, hit3]}, {'contig1': gene, 'contig2': pyfastaq.sequences.Fasta('contig2', 'ACGT')}, False),
+        ]
+
+        for rows_by_contig, contigs, expected in cases:
+            nucmer_hits = {
+                name: [pymummer.alignment.Alignment('\t'.join(row)) for row in rows]
+                for name, rows in rows_by_contig.items()
+            }
+            self.assertEqual(expected, assembly_compare.AssemblyCompare._ref_covered_by_complete_contig_with_orf(nucmer_hits, contigs))
+
+
+    def test_ref_covered_by_at_least_one_full_length_contig(self):
+        '''_ref_covered_by_at_least_one_full_length_contig should be true only when a hit spans the whole reference'''
+        # Both rows give a reference length of 39: the first hit spans 1-39, the second only 1-20.
+        full_row = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'ref', 'contig1']
+        partial_row = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'ref', 'contig1']
+        cases = [
+            (full_row, True),
+            (partial_row, False),
+        ]
+        for row, expected in cases:
+            nucmer_hits = {'contig1': [pymummer.alignment.Alignment('\t'.join(row))]}
+            got = assembly_compare.AssemblyCompare._ref_covered_by_at_least_one_full_length_contig(nucmer_hits)
+            self.assertEqual(expected, got)
+
+    def test_nucmer_hit_containing_reference_position(self):
+        '''nucmer_hit_containing_reference_position should return the hit covering a position, else None'''
+        # In the coords rows (1-based), hit1 covers reference 100-200 on contig1
+        # and hit2 covers reference 400-500 on contig2.
+        hit1 = pymummer.alignment.Alignment('\t'.join(['100', '200', '300', '400', '100', '100', '100.00', '600', '500', '1', '1', 'ref', 'contig1']))
+        hit2 = pymummer.alignment.Alignment('\t'.join(['400', '500', '500', '600', '100', '100', '100.00', '600', '600', '1', '1', 'ref', 'contig2']))
+        nucmer_hits = {'contig1': [hit1], 'contig2': [hit2]}
+
+        # Each case is (reference name, reference position, expected hit or None).
+        # The queried positions are 0-based: 99-199 are expected to fall inside hit1
+        # and 399-499 inside hit2; 'ref2' is a name none of the hits use.
+        cases = [
+            ('ref2', 150, None),
+            ('ref', 42, None),
+            ('ref', 98, None),
+            ('ref', 200, None),
+            ('ref', 99, hit1),
+            ('ref', 142, hit1),
+            ('ref', 199, hit1),
+            ('ref', 200, None),
+            ('ref', 398, None),
+            ('ref', 399, hit2),
+            ('ref', 442, hit2),
+            ('ref', 499, hit2),
+            ('ref', 500, None),
+        ]
+
+        for ref_name, ref_pos, expected in cases:
+            got = assembly_compare.AssemblyCompare.nucmer_hit_containing_reference_position(nucmer_hits, ref_name, ref_pos)
+            self.assertEqual(expected, got)
diff --git a/ariba/tests/assembly_test.py b/ariba/tests/assembly_test.py
new file mode 100644
index 00000000..3b4f828c
--- /dev/null
+++ b/ariba/tests/assembly_test.py
@@ -0,0 +1,126 @@
+import unittest
+import sys
+import os
+import shutil
+import filecmp
+import pyfastaq
+from ariba import assembly
+
+modules_dir = os.path.dirname(os.path.abspath(assembly.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestAssembly(unittest.TestCase):
+    def test_get_assembly_kmer(self):
+        '''_get_assembly_kmer should pick a kmer when given 0 and keep a non-zero kmer unchanged'''
+        reads1 = os.path.join(data_dir, 'assembly_test_set_assembly_kmer_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_set_assembly_kmer_reads_2.fq')
+        # A kmer of 0 is expected to give 5 for these fixture reads.
+        self.assertEqual(assembly.Assembly._get_assembly_kmer(0, reads1, reads2), 5)
+        # An explicit kmer is expected to come back as given.
+        self.assertEqual(assembly.Assembly._get_assembly_kmer(42, reads1, reads2), 42)
+
+
+    def test_assemble_with_spades(self):
+        '''_assemble_with_spades(unittest=True) should leave assembled_ok set to True'''
+        reads1 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_2.fq')
+        ref_fasta = os.path.join(data_dir, 'assembly_test_assemble_with_spades_ref.fa')
+        tmp_dir = 'tmp.test_assemble_with_spades'
+        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)  # remove the dir even if the test fails
+        a = assembly.Assembly(reads1, reads2, ref_fasta, tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a._assemble_with_spades(unittest=True)
+        self.assertTrue(a.assembled_ok)
+
+
+    def test_assemble_with_spades_fail(self):
+        '''_assemble_with_spades(unittest=False) should record a failed assembly as assembled_ok False'''
+        reads1 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_2.fq')
+        ref_fasta = os.path.join(data_dir, 'assembly_test_assemble_with_spades_ref.fa')
+        tmp_dir = 'tmp.test_assemble_with_spades_fail'  # own dir, so a leftover from another test cannot interfere
+        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)  # remove the dir even if the test fails
+        a = assembly.Assembly(reads1, reads2, ref_fasta, tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a._assemble_with_spades(unittest=False)
+        self.assertFalse(a.assembled_ok)
+
+
+    def test_scaffold_with_sspace(self):
+        '''_scaffold_with_sspace should leave a file at the scaffolder_scaffolds path'''
+        reads1 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_assemble_with_spades_reads_2.fq')
+        ref_fasta = os.path.join(data_dir, 'assembly_test_assemble_with_spades_ref.fa')
+        tmp_dir = 'tmp.test_scaffold_with_sspace'
+        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)  # remove the dir even if the test fails
+        a = assembly.Assembly(reads1, reads2, ref_fasta, tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a.assembly_contigs = os.path.join(data_dir, 'assembly_test_scaffold_with_sspace_contigs.fa')
+        a._scaffold_with_sspace()
+        self.assertTrue(os.path.exists(a.scaffolder_scaffolds))
+
+
+    def test_has_gaps_to_fill(self):
+        '''_has_gaps_to_fill should be true for the has_gaps fixture and false for the no_gaps fixture'''
+        gappy_fa = os.path.join(data_dir, 'assembly_test_has_gaps_to_fill.has_gaps.fa')
+        self.assertTrue(assembly.Assembly._has_gaps_to_fill(gappy_fa))
+        gapless_fa = os.path.join(data_dir, 'assembly_test_has_gaps_to_fill.no_gaps.fa')
+        self.assertFalse(assembly.Assembly._has_gaps_to_fill(gapless_fa))
+
+
+    def test_rename_scaffolds(self):
+        '''_rename_scaffolds should write a file identical to the expected output fixture'''
+        original_fa = os.path.join(data_dir, 'assembly_test_rename_scaffolds.in.fa')
+        expected_fa = os.path.join(data_dir, 'assembly_test_rename_scaffolds.out.fa')
+        renamed_fa = 'tmp.fa'
+        assembly.Assembly._rename_scaffolds(original_fa, renamed_fa, 'prefix')
+        self.assertTrue(filecmp.cmp(expected_fa, renamed_fa, shallow=False))
+        os.unlink(renamed_fa)
+
+
+    def test_gap_fill_with_gapfiller_no_gaps(self):
+        '''_gap_fill_with_gapfiller should leave a gapfilled_scaffolds file when the scaffolds fixture has no gaps'''
+        reads1 = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller_reads_2.fq')
+        tmp_dir = 'tmp.gap_fill_with_gapfiller_no_gaps'
+        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)  # remove the dir even if the test fails
+        a = assembly.Assembly(reads1, reads2, 'ref.fa', tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a.scaffolder_scaffolds = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa')
+        a._gap_fill_with_gapfiller()
+        self.assertTrue(os.path.exists(a.gapfilled_scaffolds))
+
+
+    def test_gap_fill_with_gapfiller_with_gaps(self):
+        '''_gap_fill_with_gapfiller should leave a gapfilled_scaffolds file when the scaffolds fixture has gaps'''
+        reads1 = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller_reads_2.fq')
+        tmp_dir = 'tmp.gap_fill_with_gapfiller_with_gaps'
+        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)  # remove the dir even if the test fails
+        a = assembly.Assembly(reads1, reads2, 'ref.fa', tmp_dir, 'not_needed_for_this_test.fa', 'not_needed_for_this_test.bam', sys.stdout)
+        a.scaffolder_scaffolds = os.path.join(data_dir, 'assembly_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa')
+        a._gap_fill_with_gapfiller()
+        self.assertTrue(os.path.exists(a.gapfilled_scaffolds))
+
+
+    def test_fix_contig_orientation(self):
+        '''_fix_contig_orientation should write the expected FASTA and return the expected set of names'''
+        ref_fa = os.path.join(data_dir, 'assembly_test_fix_contig_orientation.ref.fa')
+        contigs_in = os.path.join(data_dir, 'assembly_test_fix_contig_orientation.in.fa')
+        expected_fa = os.path.join(data_dir, 'assembly_test_fix_contig_orientation.out.fa')
+        fixed_fa = 'tmp.assembly_test_fix_contig_orientation.out.fa'
+        # The returned set is expected to hold only the name 'match_both_strands'.
+        got = assembly.Assembly._fix_contig_orientation(contigs_in, ref_fa, fixed_fa)
+        self.assertTrue(filecmp.cmp(expected_fa, fixed_fa, shallow=False))
+        self.assertEqual({'match_both_strands'}, got)
+        os.unlink(fixed_fa)
+
+
+    def test_parse_bam(self):
+        '''_parse_bam should return a true value for the fixture BAM and assembly'''
+        bam = os.path.join(data_dir, 'assembly_test_parse_assembly_bam.bam')
+        contigs = {}
+        pyfastaq.tasks.file_to_dict(os.path.join(data_dir, 'assembly_test_parse_assembly_bam.assembly.fa'), contigs)
+        parsed_ok = assembly.Assembly._parse_bam(contigs, bam, 10, 1000)
+        self.assertTrue(parsed_ok)
+        # Remove the side files that are expected to exist next to the BAM after parsing.
+        for suffix in ('.soft_clipped', '.unmapped_mates', '.scaff'):
+            os.unlink(bam + suffix)
+
diff --git a/ariba/tests/assembly_variants_test.py b/ariba/tests/assembly_variants_test.py
new file mode 100644
index 00000000..97253e54
--- /dev/null
+++ b/ariba/tests/assembly_variants_test.py
@@ -0,0 +1,380 @@
+import unittest
+import os
+import pymummer
+import pyfastaq
+from ariba import assembly_variants, reference_data, sequence_variant, sequence_metadata
+
+modules_dir = os.path.dirname(os.path.abspath(assembly_variants.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestAssemblyVariants(unittest.TestCase):
+    def test_get_codon_start(self):
+        '''_get_codon_start should return the expected codon start for each (start, position) pair'''
+        # Each case is (start, position, expected). In every case the expected value is
+        # start plus the largest multiple of 3 that does not go past position.
+        cases = [
+            # start = 0
+            (0, 5, 3), (0, 0, 0), (0, 1, 0), (0, 2, 0),
+            # start = 1 and start = 2
+            (1, 3, 1), (2, 3, 2),
+            # start = 3
+            (3, 3, 3), (3, 6, 6), (3, 7, 6), (3, 8, 6),
+        ]
+
+        for start, position, expected in cases:
+            got = assembly_variants.AssemblyVariants._get_codon_start(start, position)
+            self.assertEqual(expected, got)
+
+
+    def test_get_mummer_variants_no_variants(self):
+        '''_get_mummer_variants should return an empty dict for the "none" snps fixture'''
+        empty_snps = os.path.join(data_dir, 'assembly_variants_test_get_mummer_variants.none.snps')
+        variants = assembly_variants.AssemblyVariants._get_mummer_variants(empty_snps)
+        self.assertEqual({}, variants)
+
+
+    def test_get_mummer_variants_has_variants(self):
+        '''_get_mummer_variants should return lists of grouped variants keyed by contig name'''
+        snp_lines = [
+            '42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig1',
+            '42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig2',
+            '40\tT\tC\t40\t42\t42\t500\t500\t1\t1\tgene\tcontig1',
+            '2\tC\tG\t2\t42\t42\t500\t500\t1\t1\tgene\tcontig1',
+        ]
+        v1, v2, v3, v4 = [pymummer.variant.Variant(pymummer.snp.Snp(line)) for line in snp_lines]
+        # For contig1 the variant at position 2 stands alone; those at 40 and 42 share one inner list.
+        expected = {'contig1': [[v4], [v3, v1]], 'contig2': [[v2]]}
+        snp_file = os.path.join(data_dir, 'assembly_variants_test_get_mummer_variants.snp.snps')
+        self.assertEqual(expected, assembly_variants.AssemblyVariants._get_mummer_variants(snp_file))
+
+
+    def test_get_variant_effect(self):
+        '''_get_variant_effect should return the expected tuple for SNP, insertion and deletion variant lists'''
+        ref_seq = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
+        # v1-v6 come straight from single SNP lines; '.' stands for the absent base of an indel.
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v4 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v5 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v6 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tA\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v7 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v7.qry_base = 'GAT'  # overridden to a three-base insertion
+        v8 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v8.qry_base = 'TGA'  # overridden to a three-base insertion (expected TRUNC below)
+        v9 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v9.qry_base = 'ATTCCT'  # overridden to a six-base insertion
+        v10 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v10.ref_base = 'CGC'  # overridden to a three-base deletion
+        v10.ref_end = 5
+        v11 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v11.ref_base = 'CGCGAA'  # overridden to a six-base deletion
+        v11.ref_end = 8
+
+        variants = [
+            ([v1], ('SYN', '.', 1)),
+            ([v2], ('NONSYN', 'R2S', 1)),
+            ([v2, v1], ('NONSYN', 'R2S', 1)),
+            ([v3, v4], ('TRUNC', 'R2trunc', 1)),
+            ([v5], ('FSHIFT', 'R2fs', 1)),
+            ([v6], ('FSHIFT', 'R2fs', 1)),
+            ([v7], ('INS', 'R2_E3insD', 1)),
+            ([v8], ('TRUNC', 'R2trunc', 1)),
+            ([v9], ('INS', 'R2_E3insIP', 1)),
+            ([v10], ('DEL', 'R2del', 1)),
+            ([v11], ('DEL', 'R2_E3del', 1)),
+        ]
+
+        for variant_list, expected in variants:
+            self.assertEqual(expected, assembly_variants.AssemblyVariants._get_variant_effect(variant_list, ref_seq))
+
+
+    def test_filter_mummer_variants(self):
+        '''_filter_mummer_variants should drop, in place, the variant list that follows [v1, v2]'''
+        ref_seq = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('12\tG\tT\t12\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
+        mummer_variants = {'contig': [[v1, v2], [v3]]}  # each entry is a list of variants, so v3 is wrapped too
+        assembly_variants.AssemblyVariants._filter_mummer_variants(mummer_variants, ref_seq)
+        expected = {'contig': [[v1, v2]]}
+        self.assertEqual(expected, mummer_variants)
+
+
+    def test_get_one_variant_for_one_contig_non_coding(self):
+        '''_get_one_variant_for_one_contig_non_coding should return the expected tuple and used metadata for each variant'''
+        refdata = reference_data.ReferenceData(
+            non_coding_fa=os.path.join(data_dir, 'assembly_variants_test_get_variants_non_coding.fa'),
+            metadata_tsv=os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv')
+        )
+
+        ref_sequence_name = 'non_coding'
+        refdata_var_dict = refdata.metadata[ref_sequence_name]
+
+        v0 = pymummer.variant.Variant(pymummer.snp.Snp('2\tT\tA\t2\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))
+
+        # ref has A at position 3, which is variant type. This gives contig the wild type C. Shouldn't report
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('3\tA\tC\t3\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))
+
+        # ref has T at position 5, which is wild type. This gives contig variant type A. Should report
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('5\tT\tA\t5\tx\tx\t42\t42\tx\tx\tnon_coding\tcontig'))
+
+        meta0 = sequence_metadata.SequenceMetadata('non_coding\tn\tC3A\tN\tref has variant type A')
+        meta2 = sequence_metadata.SequenceMetadata('non_coding\tn\tT5A\tN\tref has wild type T')
+
+        mummer_variants = [v0, v1, v2]
+
+        expected_tuples = [
+            (1, 'n', 'T2A', 'SNP', [v0], set(), set()),   #0
+            None,                                     #1
+            (4, 'n', 'T5A', 'SNP', [v2], {meta2}, set()), #2
+        ]
+
+        expected_used_variants = [
+            set(),     #0
+            {meta0},   #1
+            {meta2},   #2
+        ]
+
+        assert len(mummer_variants) == len(expected_tuples) == len(expected_used_variants)
+
+        # zip pairs each variant with its expectations; the length check above guards
+        # against zip silently dropping a case.
+        for variant, expected_tuple, expected_used in zip(mummer_variants, expected_tuples, expected_used_variants):
+            got_tuple, got_used_variants = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_non_coding(refdata_var_dict, variant)
+            self.assertEqual(expected_tuple, got_tuple)
+            self.assertEqual(expected_used, got_used_variants)
+
+
+    def test_get_one_variant_for_one_contig_coding(self):
+        '''_get_one_variant_for_one_contig_coding should return the expected tuple and used metadata for each variant list'''
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa'),
+            metadata_tsv=os.path.join(data_dir, 'assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv')
+        )
+
+        ref_sequence_name = 'presence_absence'
+        ref_sequence = refdata.sequence(ref_sequence_name)
+        refdata_var_dict = refdata.metadata[ref_sequence_name]
+
+        v0 = pymummer.variant.Variant(pymummer.snp.Snp('6\tT\tA\t6\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('9\tA\tT\t9\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('18\tG\tT\t18\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('21\tC\tT\t21\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v4 = pymummer.variant.Variant(pymummer.snp.Snp('7\tA\tT\t7\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        v5 = pymummer.variant.Variant(pymummer.snp.Snp('12\tA\tC\t11\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        # v6: two adjacent deleted bases (reference positions 4-5)
+        v6 = pymummer.variant.Variant(pymummer.snp.Snp('4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v6.update_indel(pymummer.snp.Snp('5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        # v7: two inserted bases
+        v7 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tA\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v7.update_indel(pymummer.snp.Snp('4\t.\tA\t5\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        # v8: three adjacent deleted bases (reference positions 4-6)
+        v8 = pymummer.variant.Variant(pymummer.snp.Snp('4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v8.update_indel(pymummer.snp.Snp('5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v8.update_indel(pymummer.snp.Snp('6\tT\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        # v9: six adjacent deleted bases (reference positions 4-9)
+        v9 = pymummer.variant.Variant(pymummer.snp.Snp('4\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('5\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('6\tT\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('7\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('8\tG\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v9.update_indel(pymummer.snp.Snp('9\tA\t.\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        # v10: three inserted bases
+        v10 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tA\t4\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig'))
+        self.assertTrue(v10.update_indel(pymummer.snp.Snp('4\t.\tT\t5\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+        self.assertTrue(v10.update_indel(pymummer.snp.Snp('4\t.\tT\t6\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig')))
+
+        mummer_variants = [[v0], [v1], [v2], [v3], [v4], [v5], [v6], [v7], [v8], [v9], [v10]]
+
+        meta0 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tN\tref has wild type D (GAT=D, GAA=E)')
+        meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tN\tref has variant type R (AGA=R, AGT=S)')
+
+        expected_tuples = [
+            (1, 'p', 'D2E', 'NONSYN', [v0], {meta0}, set()),    #0
+            None,                                               #1
+            (5, 'p', 'M6I', 'NONSYN', [v2], set(), set()),      #2
+            (6, 'p', '.', 'SYN', [v3], set(), set()),           #3
+            (2, 'p', 'R3trunc', 'TRUNC', [v4], set(), {meta4}), #4
+            None,                                               #5
+            (1, 'p', 'D2fs', 'FSHIFT', [v6], set(), {meta0}),   #6
+            (1, 'p', 'D2fs', 'FSHIFT', [v7], set(), {meta0}),   #7
+            (1, 'p', 'D2del', 'DEL', [v8], set(), {meta0}),     #8
+            (1, 'p', 'D2_R3del', 'DEL', [v9], set(), {meta0}),  #9
+            (1, 'p', 'D2_R3insI', 'INS', [v10], set(), {meta0}) #10
+        ]
+
+        expected_used_variants = [
+            refdata_var_dict['p'][1], #0
+            refdata_var_dict['p'][2], #1
+            set(),                    #2
+            set(),                    #3
+            refdata_var_dict['p'][2], #4
+            refdata_var_dict['p'][3], #5
+            refdata_var_dict['p'][1], #6
+            refdata_var_dict['p'][1], #7
+            refdata_var_dict['p'][1], #8
+            refdata_var_dict['p'][1], #9
+            refdata_var_dict['p'][1], #10
+        ]
+
+        assert len(mummer_variants) == len(expected_tuples) == len(expected_used_variants)
+
+        # zip pairs each variant list with its expectations; the length check above guards against dropped cases.
+        for variant_list, expected_tuple, expected_used in zip(mummer_variants, expected_tuples, expected_used_variants):
+            got_tuple, got_used_variants = assembly_variants.AssemblyVariants._get_one_variant_for_one_contig_coding(ref_sequence, refdata_var_dict, variant_list)
+            self.assertEqual(expected_tuple, got_tuple)
+            self.assertEqual(expected_used, got_used_variants)
+
+
+    def test_get_remaining_known_ref_variants_amino_acids(self):
+        '''test _get_remaining_known_ref_variants with amino acid ('p') variants; expects one tuple each for unused ref_var2 and ref_var6'''
+        ref_var1 = sequence_metadata.SequenceMetadata('gene1\tp\tD2E\tN\tfoo bar')
+        ref_var2 = sequence_metadata.SequenceMetadata('gene1\tp\tD3E\tN\tfoo bar baz')
+        ref_var3 = sequence_metadata.SequenceMetadata('gene1\tp\tD3I\tN\tfoo bar baz spam')
+        ref_var4 = sequence_metadata.SequenceMetadata('gene1\tp\tD10E\tN\tfoo bar baz spam egg')
+        ref_var5 = sequence_metadata.SequenceMetadata('gene1\tp\tD14E\tN\tfoo bar baz spam egg chips')
+        ref_var6 = sequence_metadata.SequenceMetadata('gene1\tp\tD15E\tN\tfoo bar baz spam egg chips')
+        ref_var7 = sequence_metadata.SequenceMetadata('gene1\tp\tD40E\tN\tfoo bar baz spam egg chips')
+
+        known_ref_variants = {  # each key is one less than the position in the variant name (D2E -> 1, D40E -> 39)
+            1: {ref_var1},
+            2: {ref_var2, ref_var3},  # two variants share this position; only ref_var3 is marked as used below
+            9: {ref_var4},
+            13: {ref_var5},
+            14: {ref_var6},
+            39: {ref_var7}
+        }
+
+        used_ref_variants = {ref_var3, ref_var5}  # neither of these appears in expected below
+
+        nucmer_coords = [  # NOTE(review): looks like nucleotide coords, with amino acid key k covering bases 3k..3k+2 -- confirm
+            pyfastaq.intervals.Interval(6, 25),  # under that assumption key 2 (bases 6-8) is inside, keys 1 and 9 are not
+            pyfastaq.intervals.Interval(30, 100)  # under that assumption key 14 (bases 42-44) is inside, key 39 (117-119) is not
+        ]
+
+        expected = [(None, 'p', None, None, None, {x}, set()) for x in [ref_var2, ref_var6]]  # one tuple per remaining variant, in this order
+        got = assembly_variants.AssemblyVariants._get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nucmer_coords)
+        self.assertEqual(expected, got)
+
+
+    def test_get_remaining_known_ref_variants_nucleotides(self):
+        '''test _get_remaining_known_ref_variants with nucleotide ('n') variants; expects one tuple each for unused ref_var2, ref_var4 and ref_var7'''
+        ref_var1 = sequence_metadata.SequenceMetadata('gene1\tn\tA2C\tN\tfoo bar')
+        ref_var2 = sequence_metadata.SequenceMetadata('gene1\tn\tA3C\tN\tfoo bar baz')
+        ref_var3 = sequence_metadata.SequenceMetadata('gene1\tn\tA3T\tN\tfoo bar baz spam')
+        ref_var4 = sequence_metadata.SequenceMetadata('gene1\tn\tA10C\tN\tfoo bar baz spam egg')
+        ref_var5 = sequence_metadata.SequenceMetadata('gene1\tn\tA14C\tN\tfoo bar baz spam egg chips')
+        ref_var6 = sequence_metadata.SequenceMetadata('gene1\tn\tA15C\tN\tfoo bar baz spam egg chips')
+        ref_var7 = sequence_metadata.SequenceMetadata('gene1\tn\tA40C\tN\tfoo bar baz spam egg chips')
+
+        known_ref_variants = {  # each key is one less than the position in the variant name (A2C -> 1, A40C -> 39)
+            1: {ref_var1},
+            2: {ref_var2, ref_var3},  # two variants share this position; only ref_var3 is marked as used below
+            9: {ref_var4},
+            13: {ref_var5},
+            14: {ref_var6},
+            39: {ref_var7}
+        }
+
+        used_ref_variants = {ref_var3, ref_var5}  # neither of these appears in expected below
+
+        nucmer_coords = [  # NOTE(review): presumably keys are compared directly against these intervals -- confirm
+            pyfastaq.intervals.Interval(2, 13),  # keys 2, 9 and 13 fall inside; keys 1 and 14 fall outside and are not expected
+            pyfastaq.intervals.Interval(30, 100)  # key 39 falls inside
+        ]
+
+        expected = [(None, 'n', None, None, None, {x}, set()) for x in [ref_var2, ref_var4, ref_var7]]  # one tuple per remaining variant, in this order
+        got = assembly_variants.AssemblyVariants._get_remaining_known_ref_variants(known_ref_variants, used_ref_variants, nucmer_coords)
+        self.assertEqual(expected, got)
+
+
+    def test_get_variants_presence_absence(self):
+        '''test get_variants on a presence/absence gene, with two contigs that have different nucmer coords'''
+        meta1 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD2E\tN\tref has wild type D (GAT=D, GAA=E)')
+        meta2 = sequence_metadata.SequenceMetadata('presence_absence\tp\tS3R\tN\tref has variant type R (AGA=R, AGT=S)')
+        meta3 = sequence_metadata.SequenceMetadata('presence_absence\tp\tD4E\tN\tref has variant type E (GAA=E, GAC=D)')
+        meta4 = sequence_metadata.SequenceMetadata('presence_absence\tp\tA5D\tN\tref has wild type A (GCG=A, GAC=D)')
+        meta5 = sequence_metadata.SequenceMetadata('presence_absence\tp\tR13S\tY\tref and qry have wild type, but Y in column 4')
+
+        metadata_tsv = 'tmp.test_get_variants_presence_absence.metadata.tsv'  # relative path, so created in the current working directory
+        with open(metadata_tsv, 'w') as f:
+            print(meta1, file=f)  # writes str(meta1); presumably the same tab-separated form it was built from -- confirm
+            print(meta2, file=f)
+            print(meta3, file=f)
+            print(meta4, file=f)
+            print(meta5, file=f)
+
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=os.path.join(data_dir, 'assembly_variants_test_get_variants_presence_absence.fa'),
+            metadata_tsv=metadata_tsv
+        )
+
+        os.unlink(metadata_tsv)  # the temporary file is not referenced again after ReferenceData is constructed
+
+        nucmer_snp_file = os.path.join(data_dir, 'assembly_variants_test_get_variants_presence_absence.snps')
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('9\tA\tT\t9\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))  # NOTE(review): v1 is not referenced in expected below
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tpresence_absence\tcontig1'))
+
+        nucmer_coords = {  # contig name -> list of intervals, passed straight to get_variants
+            'contig1': [pyfastaq.intervals.Interval(0, 30)],
+            'contig2': [pyfastaq.intervals.Interval(10, 41)],
+        }
+
+        expected = {  # contig name -> list of 7-field tuples
+            'contig1': [
+               (4, 'p', 'A5D', 'NONSYN', [v2, v3], {meta4}, set()),  # v2 and v3 are expected together as a single A5D change
+               (None, 'p', None, None, None, {meta3}, set()),
+            ],
+            'contig2': [
+               (None, 'p', None, None, None, {meta3}, set()),
+               (None, 'p', None, None, None, {meta5}, set()),  # meta5 is expected for contig2 only
+            ],
+        }
+
+        a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
+        got = a_variants.get_variants('presence_absence', nucmer_coords)
+        self.assertEqual(expected, got)
+
+
+    def test_get_variants_variants_only(self):
+        '''test get_variants on a variants-only gene; only the meta3 tuple is expected for each contig'''
+        meta1 = sequence_metadata.SequenceMetadata('variants_only\tp\tD2E\tN\tref has wild type D (GAT=D, GAA=E)')
+        meta2 = sequence_metadata.SequenceMetadata('variants_only\tp\tS3R\tN\tref has variant type R (AGA=R, AGT=S)')
+        meta3 = sequence_metadata.SequenceMetadata('variants_only\tp\tD4E\tN\tref has variant type E (GAA=E, GAC=D)')
+
+        metadata_tsv = 'tmp.test_get_variants_variants_only.metadata.tsv'  # relative path, so created in the current working directory
+        with open(metadata_tsv, 'w') as f:
+            print(meta1, file=f)  # writes str(meta1); presumably the same tab-separated form it was built from -- confirm
+            print(meta2, file=f)
+            print(meta3, file=f)
+
+        refdata = reference_data.ReferenceData(
+            variants_only_fa=os.path.join(data_dir, 'assembly_variants_test_get_variants_variants_only.fa'),
+            metadata_tsv=metadata_tsv
+        )
+
+        os.unlink(metadata_tsv)  # the temporary file is not referenced again after ReferenceData is constructed
+
+        nucmer_snp_file = os.path.join(data_dir, 'assembly_variants_test_get_variants_variants_only.snps')
+        v1 = pymummer.variant.Variant(pymummer.snp.Snp('9\tA\tT\t9\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))  # NOTE(review): v1, v2 and v3 are not referenced again in this test
+        v2 = pymummer.variant.Variant(pymummer.snp.Snp('14\tC\tA\t14\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
+        v3 = pymummer.variant.Variant(pymummer.snp.Snp('15\tG\tC\t15\tx\tx\t42\t42\tx\tx\tvariants_only\tcontig1'))
+
+        nucmer_coords = {  # contig name -> list of intervals, passed straight to get_variants
+            'contig1': [pyfastaq.intervals.Interval(0, 41)],
+            'contig2': [pyfastaq.intervals.Interval(10, 41)],
+        }
+
+        expected = {  # both contigs are expected to yield the same single tuple
+            'contig1': [(None, 'p', None, None, None, {meta3}, set())],
+            'contig2': [(None, 'p', None, None, None, {meta3}, set())],
+        }
+
+        self.maxDiff = None  # no length limit on the diff unittest prints if the assertion fails
+        a_variants = assembly_variants.AssemblyVariants(refdata, nucmer_snp_file)
+        got = a_variants.get_variants('variants_only', nucmer_coords)
+        self.assertEqual(expected, got)
+
diff --git a/ariba/tests/best_seq_chooser_test.py b/ariba/tests/best_seq_chooser_test.py
new file mode 100644
index 00000000..80d427ae
--- /dev/null
+++ b/ariba/tests/best_seq_chooser_test.py
@@ -0,0 +1,69 @@
+import unittest
+import sys
+import os
+import pyfastaq
+from ariba import best_seq_chooser, external_progs
+
+modules_dir = os.path.dirname(os.path.abspath(best_seq_chooser.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
+
+
+class TestBestSeqChooser(unittest.TestCase):  # every test builds a BestSeqChooser from fixture reads/ref files under data_dir
+    def test_total_alignment_score(self):
+        '''test _total_alignment_score returns 3000 for '1' with the fixture reads'''
+        reads1 = os.path.join(data_dir, 'best_seq_chooser_total_alignment_score_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'best_seq_chooser_total_alignment_score_reads_2.fq')
+        ref = os.path.join(data_dir, 'best_seq_chooser_total_alignment_score_ref_seqs.fa')
+        chooser = best_seq_chooser.BestSeqChooser(
+            reads1,
+            reads2,
+            ref,
+            sys.stdout,  # presumably the log filehandle -- confirm against BestSeqChooser.__init__
+            samtools_exe=extern_progs.exe('samtools'),
+            bowtie2_exe=extern_progs.exe('bowtie2'),
+        )
+        self.assertEqual(3000, chooser._total_alignment_score('1'))  # presumably '1' is a sequence name in the ref fasta -- confirm
+
+
+    def test_get_best_seq_by_alignment_score(self):
+        '''test _get_best_seq_by_alignment_score returns '1' for the fixture data'''
+        reads1 = os.path.join(data_dir, 'best_seq_chooser_get_best_seq_by_alignment_score_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'best_seq_chooser_get_best_seq_by_alignment_score_reads_2.fq')
+        ref = os.path.join(data_dir, 'best_seq_chooser_get_best_seq_by_alignment_score_ref.fa')
+        chooser = best_seq_chooser.BestSeqChooser(
+            reads1,
+            reads2,
+            ref,
+            sys.stdout,  # presumably the log filehandle -- confirm against BestSeqChooser.__init__
+            samtools_exe=extern_progs.exe('samtools'),
+            bowtie2_exe=extern_progs.exe('bowtie2'),
+        )
+        self.assertEqual('1', chooser._get_best_seq_by_alignment_score())
+
+
+    def test_best_seq(self):
+        '''test best_seq returns the expected Fasta record named '1' '''
+        reads1 = os.path.join(data_dir, 'best_seq_chooser_best_seq_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'best_seq_chooser_best_seq_reads_2.fq')
+        ref = os.path.join(data_dir, 'best_seq_chooser_best_seq_ref.fa')
+        expected_seq = pyfastaq.sequences.Fasta('1', ''.join([  # five 60-character chunks joined into one sequence
+            'AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG',
+            'TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC',
+            'CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA',
+            'GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG',
+            'TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC',
+        ]))
+
+        tmp_file = 'tmp.best_seq.fa'  # relative path; presumably best_seq writes the chosen sequence here -- confirm
+        chooser = best_seq_chooser.BestSeqChooser(
+            reads1,
+            reads2,
+            ref,
+            sys.stdout,  # presumably the log filehandle -- confirm against BestSeqChooser.__init__
+            samtools_exe=extern_progs.exe('samtools'),
+            bowtie2_exe=extern_progs.exe('bowtie2'),
+        )
+        got_seq = chooser.best_seq(tmp_file)
+        self.assertEqual(expected_seq, got_seq)
+        os.unlink(tmp_file)  # NOTE(review): not reached if the assertion above fails, so tmp.best_seq.fa is then left behind
diff --git a/ariba/tests/cdhit_test.py b/ariba/tests/cdhit_test.py
index dcb1aec7..c0421d7c 100644
--- a/ariba/tests/cdhit_test.py
+++ b/ariba/tests/cdhit_test.py
@@ -1,10 +1,11 @@
 import unittest
 import os
 import filecmp
-from ariba import cdhit
+from ariba import cdhit, external_progs
 
 modules_dir = os.path.dirname(os.path.abspath(cdhit.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
 
 class TestCdhit(unittest.TestCase):
     def test_init_fail_infile_missing(self):
@@ -13,55 +14,25 @@ def test_init_fail_infile_missing(self):
             r = cdhit.Runner('oopsnotafile', 'out')
 
 
-    def test_enumerate_fasta(self):
-        '''test _enumerate_fasta'''
-        infile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.in.fa')
-        expected_outfile = os.path.join(data_dir, 'cdhit_test_enumerate_fasta.out.fa')
-        tmpfile = 'tmp.test_enumerate_fasta.out.fa'
-        expected_dict = {'1': 'a', '2': 'b', '3': 'c'}
-        r = cdhit.Runner(infile, 'out')
-        got_dict = r._enumerate_fasta(infile, tmpfile)
-        self.assertTrue(filecmp.cmp(expected_outfile, tmpfile, shallow=False))
-        self.assertEqual(expected_dict, got_dict)
-        os.unlink(tmpfile)
-
-
     def test_get_ids(self):
         '''test _get_ids'''
         infile = os.path.join(data_dir, 'cdhit_test_get_ids.fa')
         expected = {'id1', 'id2', 'id3'}
-        r = cdhit.Runner(infile, 'out')
+        r = cdhit.Runner(infile, 'out', cd_hit_est=extern_progs.exe('cdhit'))  # presumably the resolved cd-hit executable -- confirm against external_progs
         got = r._get_ids(infile)
         self.assertEqual(expected, got)
 
 
     def test_parse_cluster_info_file(self):
         '''test _parse_cluster_info_file'''
-        infile = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.in.fa')
-        r = cdhit.Runner(infile, 'out')
-        names_dict = {str(i): 'seq' + str(i) for i in range(1,5)}
-        cluster_representatives = {'1', '4'}
-        cluster_file = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.out.fa.bak.clstr')
-        got_clusters, got_reps = r._parse_cluster_info_file(cluster_file, names_dict, cluster_representatives)
+        cluster_representatives = {'seq1', 'seq4'}  # same names that key expected_clusters below
+        infile = os.path.join(data_dir, 'cdhit_test_parse_cluster_info_file.infile')
+        got_clusters = cdhit.Runner._parse_cluster_info_file(infile, cluster_representatives)  # called on the class, no Runner instance is built
         expected_clusters = {
-            '0': {'seq1', 'seq2', 'seq3'},
-            '1': {'seq4'}
+            'seq1': {'seq1', 'seq2', 'seq3'},  # each key is also a member of its own cluster
+            'seq4': {'seq4'}  # a cluster holding only its representative
         }
-        expected_reps = {'1': '0', '4': '1'}
         self.assertEqual(expected_clusters, got_clusters)
-        self.assertEqual(expected_reps, got_reps)
-
-
-    def test_rename_fasta(self):
-        '''test _rename_fasta'''
-        infile = os.path.join(data_dir, 'cdhit_test_rename_fasta.in.fa')
-        tmpfile = 'tmp.rename_fasta.out.fa'
-        expected = os.path.join(data_dir, 'cdhit_test_rename_fasta.out.fa')
-        names_dict = {'a': 'seq1', 'b': 'seq2', 'c': 'seq3'}
-        r = cdhit.Runner(infile, 'out')
-        r._rename_fasta(infile, tmpfile, names_dict)
-        self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
-        os.unlink(tmpfile)
 
 
     def test_run(self):
@@ -69,11 +40,11 @@ def test_run(self):
         infile = os.path.join(data_dir, 'cdhit_test_run.in.fa')
         expected_outfile = os.path.join(data_dir, 'cdhit_test_run.out.fa')
         tmpfile = 'tmp.cdhit_test_run.out.fa'
-        r = cdhit.Runner(infile, tmpfile)
+        r = cdhit.Runner(infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
         clusters = r.run()
         expected_clusters = {
-            '0': {'seq1', 'seq2', 'seq3'},
-            '1': {'seq4'},
+            'seq1': {'seq1', 'seq2', 'seq3'},
+            'seq4': {'seq4'},
         }
         self.assertEqual(clusters, expected_clusters)
         self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
@@ -85,13 +56,13 @@ def test_fake_run(self):
         infile = os.path.join(data_dir, 'cdhit_test_fake_run.in.fa')
         expected_outfile = os.path.join(data_dir, 'cdhit_test_fake_run.out.fa')
         tmpfile = 'tmp.cdhit_test_fake_run.out.fa'
-        r = cdhit.Runner(infile, tmpfile)
+        r = cdhit.Runner(infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
         clusters = r.fake_run()
         expected_clusters = {
-            '0': {'seq1'},
-            '1': {'seq2'},
-            '2': {'seq3'},
-            '3': {'seq4'},
+            'seq1': {'seq1'},
+            'seq2': {'seq2'},
+            'seq3': {'seq3'},
+            'seq4': {'seq4'},
         }
         self.assertEqual(clusters, expected_clusters)
         self.assertTrue(filecmp.cmp(tmpfile, expected_outfile, shallow=False))
@@ -102,7 +73,7 @@ def test_fake_run_fail(self):
         '''test fake_run with non-unique names'''
         infile = os.path.join(data_dir, 'cdhit_test_fake_run.non-unique.in.fa')
         tmpfile = 'tmp.cdhit_test_fake_run.out.non-unique.fa'
-        r = cdhit.Runner(infile, tmpfile)
+        r = cdhit.Runner(infile, tmpfile, cd_hit_est=extern_progs.exe('cdhit'))
         with self.assertRaises(cdhit.Error):
             clusters = r.fake_run()
         os.unlink(tmpfile)
diff --git a/ariba/tests/cluster_test.py b/ariba/tests/cluster_test.py
index b6c47468..24f5a4ee 100644
--- a/ariba/tests/cluster_test.py
+++ b/ariba/tests/cluster_test.py
@@ -6,7 +6,7 @@
 import pyfastaq
 import pysam
 import pymummer
-from ariba import cluster, flag
+from ariba import cluster, flag, reference_data
 
 modules_dir = os.path.dirname(os.path.abspath(cluster.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
@@ -32,809 +32,174 @@ def clean_cluster_dir(d, exclude=None):
                 os.unlink(full_path)
 
 
-def file2lines(filename):
-    f = pyfastaq.utils.open_file_read(filename)
-    lines = f.readlines()
-    pyfastaq.utils.close(f)
-    return lines
-
-
-def load_gene(filename):
-    file_reader = pyfastaq.sequences.file_reader(filename)
-    seq = None
-    for seq in file_reader:
-        pass
-    return seq
-
-
 class TestCluster(unittest.TestCase):
     def test_init_fail_files_missing(self):
         '''test init_fail_files_missing'''
+        refdata_fa = os.path.join(data_dir, 'cluster_test_init_refdata.fa')
+        refdata = reference_data.ReferenceData(presence_absence_fa=refdata_fa)  # one refdata object shared by every Cluster call below
+
         dirs = [
-            'cluster_test_directorynotexist'
-            'cluster_test_init_no_genes_fa',
+            'cluster_test_init_no_refs_fa',  # fixture directories, each expected to make Cluster raise
             'cluster_test_init_no_reads_1',
             'cluster_test_init_no_reads_2',
         ]
         dirs = [os.path.join(data_dir, d) for d in dirs]
         for d in dirs:
-            clean_cluster_dir(d)
+            tmpdir = 'tmp.cluster_test_init_fail_files_missing'  # same name every iteration
+            shutil.copytree(d, tmpdir)  # Cluster is pointed at a copy rather than at the fixture directory
             with self.assertRaises(cluster.Error):
-                c = cluster.Cluster(d, 'name')
-            clean_cluster_dir(d)
+                c = cluster.Cluster(tmpdir, 'name', refdata=refdata)
+            shutil.rmtree(tmpdir)  # must go before the next copytree, which fails if the destination exists
+
+        with self.assertRaises(cluster.Error):  # a missing directory cannot be copied, so it is checked outside the loop
+            c = cluster.Cluster('directorydoesnotexistshouldthrowerror', 'name', refdata=refdata)
 
 
-    def test_get_read_counts(self):
-        '''test _get_read_counts pass'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_get_read_counts')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        self.assertEqual(2, c._get_read_counts())
-        clean_cluster_dir(cluster_dir)
+    def test_count_reads(self):
+        '''test _count_reads returns 4 for the two fixture fastq files'''
+        reads1 = os.path.join(data_dir, 'cluster_test_count_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'cluster_test_count_reads_2.fq')
+        self.assertEqual(4, cluster.Cluster._count_reads(reads1, reads2))  # called on the class, no Cluster instance is built
 
 
-    def test_get_read_counts_fail(self):
-        '''test _get_read_counts fail'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_get_read_counts_fail')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        with self.assertRaises(cluster.Error):
-            c._get_read_counts()
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_total_alignment_score(self):
-        '''test _get_total_alignment_score'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_get_total_alignment_score')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        got_score = c._get_total_alignment_score('1')
-        expected_score = 3000
-        self.assertEqual(got_score, expected_score)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_best_gene_by_alignment_score(self):
-        '''test _get_best_gene_by_alignment_score'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_get_best_gene_by_alignment_score')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        got_name = c._get_best_gene_by_alignment_score()
-        self.assertEqual(got_name, '1')
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_choose_best_gene(self):
-        '''test _choose_best_gene'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_choose_best_gene')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        expected_gene = pyfastaq.sequences.Fasta('1', ''.join([
-            'AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG',
-            'TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC',
-            'CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA',
-            'GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG',
-            'TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC',
-        ]))
-        expected_gene_fa = os.path.join(data_dir, 'cluster_test_choose_best_gene.gene.fa')
-        got = c._choose_best_gene()
-        self.assertEqual(got, expected_gene)
-        self.assertTrue(filecmp.cmp(expected_gene_fa, c.gene_fa, shallow=False))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_set_assembly_kmer(self):
-        '''test _set_assembly_kmer'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_set_assembly_kmer')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name', assembly_kmer=42)
-        self.assertEqual(c.assembly_kmer, 42)
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(os.path.join(data_dir, 'cluster_test_set_assembly_kmer'), 'name')
-        self.assertEqual(c.assembly_kmer, 5)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_assemble_with_spades(self):
-        '''test _assemble_with_spades'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa'), c.gene_fa)
-        c._assemble_with_spades(unittest=True)
-        self.assertEqual(c.status_flag.to_number(), 0)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_assemble_with_spades_fail(self):
-        '''test _assemble_with_spades handles spades fail'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_assemble_with_spades')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(os.path.join(data_dir, 'cluster_test_assemble_with_spades.gene.fa'), c.gene_fa)
-        c._assemble_with_spades()
-        self.assertEqual(c.status_flag.to_number(), 64)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_scaffold_with_sspace(self):
-        '''test _scaffold_with_sspace'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_scaffold_with_sspace')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(
-            os.path.join(data_dir, 'cluster_test_scaffold_with_sspace.contigs.fa'),
-            c.assembly_contigs
+    def test_full_run_choose_ref_fail(self):
+        '''test complete run of cluster when choosing the ref seq fails: expects one report line and both failure flags set'''
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail.presence_absence.fa')
         )
-        #shutil.copyfile(os.path.join(data_dir, 'cluster_test_scaffold_with_sspace.gene.fa'), c.gene_fa)
-        c._scaffold_with_sspace()
-        self.assertTrue(os.path.exists(c.scaffolder_scaffolds))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_gap_fill_with_gapfiller_no_gaps(self):
-        '''test _gap_fill_with_gapfiller no gaps'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(
-            os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa'),
-            c.scaffolder_scaffolds
+        tmpdir = 'tmp.test_full_run_choose_ref_fail'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_choose_ref_fail'), tmpdir)  # Cluster runs on a copy, not on the fixture directory
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata)
+        c.run()
+
+        expected = '\t'.join(['.', '.', '1088', '2', 'cluster_name'] + ['.'] * 23)  # 28 fields; 1088 = 1024 + 64, presumably the numeric form of the two flags asserted below -- confirm against ariba.flag
+        self.assertEqual([expected], c.report_lines)
+        self.assertTrue(c.status_flag.has('ref_seq_choose_fail'))
+        self.assertTrue(c.status_flag.has('assembly_fail'))
+        shutil.rmtree(tmpdir)  # NOTE(review): not reached if an assertion above fails, so tmpdir is then left behind
+
+
+    def test_full_run_assembly_fail(self):
+        '''test complete run of cluster when assembly fails: expects one report line, assembly_fail set and ref_seq_choose_fail unset'''
+        refdata = reference_data.ReferenceData(
+            non_coding_fa=os.path.join(data_dir, 'cluster_test_full_run_assembly_fail.noncoding.fa')
         )
-        c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')
-        c._gap_fill_with_gapfiller()
-        self.assertTrue(os.path.exists(c.gapfilled_scaffolds))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_gap_fill_with_gapfiller_with_gaps(self):
-        '''test _gap_fill_with_gapfiller with gaps'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(
-            os.path.join(data_dir, 'cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa'),
-            c.scaffolder_scaffolds
+        tmpdir = 'tmp.test_full_run_assembly_fail'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_assembly_fail'), tmpdir)  # Cluster runs on a copy, not on the fixture directory
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata)
+        c.run()
+
+        expected = '\t'.join(['noncoding_ref_seq', 'non_coding', '64', '4', 'cluster_name'] + ['.'] * 23)  # 28 fields; '64' is presumably the numeric form of assembly_fail alone -- confirm against ariba.flag
+        self.assertEqual([expected], c.report_lines)
+        self.assertFalse(c.status_flag.has('ref_seq_choose_fail'))  # a ref seq is named in the report line, so choosing it did not fail
+        self.assertTrue(c.status_flag.has('assembly_fail'))
+        shutil.rmtree(tmpdir)  # NOTE(review): not reached if an assertion above fails, so tmpdir is then left behind
+
+
+    def test_full_run_ok_non_coding(self):
+        '''test complete run of cluster on a noncoding sequence'''
+        refdata = reference_data.ReferenceData(
+            non_coding_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding.metadata.tsv')
         )
-        c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')
-        c._gap_fill_with_gapfiller()
-        self.assertTrue(os.path.exists(c.gapfilled_scaffolds))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_rename_scaffolds(self):
-        '''test _rename_scaffolds'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_rename_scaffolds')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('name_of_gene', 'AAACCCGGGTTT')
-        infile = os.path.join(data_dir, 'cluster_test_rename_scaffolds.in.fa')
-        outfile = os.path.join(data_dir, 'cluster_test_rename_scaffolds.out.fa')
-        tmpfile = 'tmp.fa'
-        c._rename_scaffolds(infile, tmpfile)
-        self.assertTrue(filecmp.cmp(outfile, tmpfile, shallow=False))
-        os.unlink(tmpfile)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_fix_contig_orientation(self):
-        '''test _fix_contig_orientation'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_fix_contig_orientation')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        scaffs_in = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.in.fa')
-        scaffs_out = os.path.join(data_dir, 'cluster_test_fix_contig_orientation.out.fa')
-        shutil.copyfile(scaffs_in, c.gapfilled_scaffolds)
-        shutil.copyfile(os.path.join(data_dir, 'cluster_test_fix_contig_orientation.gene.fa'), c.gene_fa)
-        c._fix_contig_orientation()
-        self.assertTrue(filecmp.cmp(scaffs_out, c.final_assembly_fa, shallow=False))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_load_final_contigs(self):
-        '''test _load_final_contigs'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_load_final_contigs')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        contigs_file = os.path.join(data_dir, 'cluster_test_load_final_contigs.contigs.fa')
-        shutil.copyfile(contigs_file, c.final_assembly_fa)
-        c._load_final_contigs()
-        expected = {
-            'spam': pyfastaq.sequences.Fasta('spam', 'ACGT'),
-            'egg1': pyfastaq.sequences.Fasta('egg1', 'TGCA'),
-            'egg2': pyfastaq.sequences.Fasta('egg2', 'AAAA'),
-        }
-        self.assertEqual(expected, c.final_assembly)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_parse_assembly_vs_gene_coords(self):
-        '''test _parse_assembly_vs_gene_coords'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_parse_assembly_vs_gene_coords')
-        coords_file = os.path.join(data_dir, 'cluster_test_parse_assembly_vs_gene_coords.coords')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        shutil.copyfile(coords_file, c.assembly_vs_gene_coords)
-        c.gene = pyfastaq.sequences.Fasta('gene', 'AAACCCGGGTTT')
-        c._parse_assembly_vs_gene_coords()
-        line1 = ['1', '1000', '1', '1000', '1000', '1000', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1']
-        line2 = ['1', '240', '1', '240', '240', '240', '100.00', '1000', '580', '1', '1', 'gene', 'contig2']
-        line3 = ['661', '1000', '241', '580', '340', '340', '100.00', '1000', '580', '1', '1', 'gene', 'contig2']
-        expected = {
-            'contig1': [pymummer.alignment.Alignment('\t'.join(line1))],
-            'contig2': [pymummer.alignment.Alignment('\t'.join(line2)), pymummer.alignment.Alignment('\t'.join(line3))],
-        }
-        self.assertEqual(expected, c.nucmer_hits)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_parse_assembly_bam(self):
-        '''test _parse_assembly_bam'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_parse_assembly_bam')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        bam = os.path.join(data_dir, 'cluster_test_parse_assembly_bam.bam')
-        assembly_fa = os.path.join(data_dir, 'cluster_test_parse_assembly_bam.assembly.fa')
-        shutil.copyfile(bam, c.final_assembly_bam)
-        shutil.copy(assembly_fa, c.final_assembly_fa)
-        c._load_final_contigs()
-        c._parse_assembly_bam()
-        for e in ['scaff', 'soft_clipped', 'unmapped_mates']:
-            self.assertTrue(os.path.exists(c.final_assembly_bam + '.' + e))
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_nucmer_hits_to_percent_identity(self):
-        '''test _nucmer_hits_to_percent_identity'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hits = [
-            ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['1', '42', '1', '42', '42', '42', '42.42', '1000', '1000', '1', '1', 'gene', 'scaff2'],
-        ]
-        c.nucmer_hits = {
-            'scaff1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-            ],
-            'scaff2': [
-                pymummer.alignment.Alignment('\t'.join(hits[2])),
-            ]
-        }
-        expected = {'scaff1': round((90*10 + 100*34) / (10+34), 2), 'scaff2': 42.42}
-        c._nucmer_hits_to_percent_identity()
-        self.assertEqual(expected, c.percent_identities)
-
-
-    def test_nucmer_hits_to_scaff_coords(self):
-        '''test _nucmer_hits_to_scaff_coords'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hits = [
-            ['1', '10', '1', '10', '10', '10', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['9', '42', '9', '42', '34', '34', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['50', '52', '50', '52', '3', '3', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1'],
-            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff2'],
-        ]
-        c.nucmer_hits = {
-            'scaff1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-                pymummer.alignment.Alignment('\t'.join(hits[2])),
-            ],
-            'scaff2': [
-                pymummer.alignment.Alignment('\t'.join(hits[3])),
-            ]
-        }
-        got = c._nucmer_hits_to_scaff_coords()
-        expected = {
-            'scaff1': [
-                pyfastaq.intervals.Interval(0, 41),
-                pyfastaq.intervals.Interval(49, 51)
-            ],
-            'scaff2': [
-                pyfastaq.intervals.Interval(0, 41),
-            ]
-        }
-        self.assertEqual(got, expected)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_nucmer_hits_to_ref_coords(self):
-        '''test _nucmer_hits_to_ref_coords'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hits = [
-            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1'],
-            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'gene', 'contig1'],
-            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'gene', 'contig2'],
-        ]
-        c.nucmer_hits = {
-            'contig1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-            ],
-            'contig2': [
-                pymummer.alignment.Alignment('\t'.join(hits[2])),
-            ]
-        }
-        got_coords = c._nucmer_hits_to_ref_coords()
+
+        tmpdir = 'tmp.test_full_run_ok_non_coding'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_non_coding'), tmpdir)
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler')
+        c.run()
+
         expected = [
-            pyfastaq.intervals.Interval(0,41),
-            pyfastaq.intervals.Interval(99, 109),
-            pyfastaq.intervals.Interval(99, 141),
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t73\t73\tT\t19\t.\t19\tnoncoding1_n_A14T_N_ref has wild type, reads has variant so should report\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t0\t.\tn\t.\t0\tG61T\tSNP\t60\t60\tG\t120\t120\tT\t24\t.\t24\t.\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t0\t.\tn\t.\t0\t.82C\tINS\t81\t81\t.\t142\t142\tC\t23\t.\t23\t.\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t0\t.\tn\t.\t0\tT108.\tDEL\t107\t107\tT\t167\t167\t.\t17\t.\t17\t.\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t531\t72\tcluster_name\t120\t120\t95.87\tnoncoding1.scaffold.1\t234\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t66\t66\tG\t19\t.\t19\tnoncoding1_n_A6G_N_variant in ref and reads so should report\tgeneric description of noncoding1'
         ]
-        self.assertEqual(got_coords, expected)
 
-        got_coords = c._nucmer_hits_to_ref_coords(contig='contig2')
+        self.assertEqual(expected, c.report_lines)
+        shutil.rmtree(tmpdir)
+
+
+    def test_full_run_ok_presence_absence(self):
+        '''test complete run of cluster on a presence absence gene'''
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence.metadata.tsv'),
+        )
+
+        tmpdir = 'tmp.cluster_test_full_run_ok_presence_absence'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_presence_absence'), tmpdir)
+        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)  # also cleans up when run()/assert fails; ignore_errors tolerates the rmtree at the end of this test
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler')
+        c.run()
+
         expected = [
-            pyfastaq.intervals.Interval(99, 109),
-        ]
-        self.assertEqual(got_coords, expected)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_nucmer_hits_to_gene_cov_per_contig(self):
-        '''test _nucmer_hits_to_gene_cov_per_contig'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hits = [
-            ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'contig1'],
-            ['100', '142', '200', '242', '42', '42', '99.42', '1000', '1000', '1', '1', 'gene', 'contig1'],
-            ['100', '110', '200', '210', '11', '11', '99.42', '1000', '1000', '1', '1', 'gene', 'contig2'],
-        ]
-        c.nucmer_hits = {
-            'contig1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-            ],
-            'contig2': [
-                pymummer.alignment.Alignment('\t'.join(hits[2])),
-            ]
-        }
-
-        expected = {'contig1': 85, 'contig2': 11}
-        self.assertEqual(expected, c._nucmer_hits_to_gene_cov_per_contig())
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_nucmer_hits_to_assembled_gene_sequences(self):
-        '''test _nucmer_hits_to_assembled_gene_sequences'''
-        ref_gene = pyfastaq.sequences.Fasta('ref_gene', 'ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAG')
-        assembly = {
-            'contig1': pyfastaq.sequences.Fasta('contig1', 'CATCTATGCTGCATCGATCACTGACGTATCATCATCAGCGTACTGACGTATTAGTTTGTAATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACTGCCAGTGGCATCTGTGTAAGCGCTTAGACGTCGTACTACTGTATATGCATCGATCTGAA'),
-            'contig2': pyfastaq.sequences.Fasta('contig2', 'AGTGATATCCTGCGATCTATAATTTTTTTCGCGGGATCTTGAACGCGACGATGTTCGATAATTCAATGCAAAGGAGCGACCCGCAAGTACACAGGACTGCAAA')
-        }
-
-        hits = [
-            ['1', '147', '61', '207', '147', '147', '100.00', '147', '239', '1', '1', 'ref_gene', 'contig1'],
-            ['18', '120', '103', '1', '103', '103', '100.00', '147', '103', '1', '-1', 'ref_gene', 'contig2']
-        ]
-        nucmer_hits = {
-            'contig1': [
-                pymummer.alignment.Alignment('\t'.join(hits[0])),
-            ],
-            'contig2': [
-                pymummer.alignment.Alignment('\t'.join(hits[1])),
-            ]
-        }
-
-        assembly_fasta = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.assembly.fa')
-        tmp_outfile = 'tmp.test_nucmer_hits_to_assembled_gene_sequences.out.fa'
-        expected_outfile = os.path.join(data_dir, 'cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa')
-        cluster.Cluster._nucmer_hits_to_assembled_gene_sequences(nucmer_hits, ref_gene, assembly, tmp_outfile)
-        self.assertTrue(filecmp.cmp(tmp_outfile, expected_outfile, shallow=False))
-        os.unlink(tmp_outfile)
-
-
-    def test_whole_gene_covered_by_nucmer_hits(self):
-        '''test _whole_gene_covered_by_nucmer_hits'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT')
-        hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1']
-        hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']
-        hit3 = ['6', '10', '6', '10', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']
-        nucmer_hits = [
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
-            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2))]},
-            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2)), pymummer.alignment.Alignment('\t'.join(hit3))]}
-        ]
-        expected = [True, False, True]
-        for i in range(len(nucmer_hits)):
-            c.nucmer_hits = nucmer_hits[i]
-            self.assertEqual(expected[i], c._whole_gene_covered_by_nucmer_hits())
-
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_gene_coverage_unique(self):
-        '''test _gene_coverage_unique'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'ACGTGTGCAT')
-        hit1 = ['1', '10', '1', '10', '10', '10', '100.00', '10', '10', '1', '1', 'gene', 'contig1']
-        hit2 = ['1', '5', '1', '5', '5', '5', '100.00', '10', '10', '1', '1', 'gene', 'contig2']
-        c.nucmer_hits = { 'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))] }
-        self.assertTrue(c._gene_coverage_unique())
-        c.nucmer_hits['contig2'] = [pymummer.alignment.Alignment('\t'.join(hit2))]
-        self.assertFalse(c._gene_coverage_unique())
-
-
-    def test_gene_covered_by_complete_contig_with_orf(self):
-        '''test _gene_covered_by_complete_contig_with_orf'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        gene_no_orf = pyfastaq.sequences.Fasta('gene', 'GATTGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        c.gene = gene
-        hit1 = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
-        hit2 = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
-        hit3 = ['21', '39', '21', '39', '19', '19', '100.00', '39', '39', '1', '1', 'gene', 'contig2']
-        nucmer_hits = [
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
-            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2))]},
-            {'contig2': [pymummer.alignment.Alignment('\t'.join(hit2)), pymummer.alignment.Alignment('\t'.join(hit3))]},
-        ]
-        expected = [True, False, False, False]
-        assemblies = [
-            {'contig1': gene},
-            {'contig1': gene_no_orf},
-            {'contig1': gene},
-            {'contig1': gene, 'contig2': pyfastaq.sequences.Fasta('contig2', 'ACGT')}
-        ]
-        assert len(expected) == len(nucmer_hits) == len(assemblies)
-        for i in range(len(expected)):
-            c.final_assembly = assemblies[i]
-            c.nucmer_hits = nucmer_hits[i]
-            self.assertEqual(c._gene_covered_by_complete_contig_with_orf(), expected[i])
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_gene_covered_by_at_least_one_full_length_contig(self):
-        '''test _gene_covered_by_at_least_one_full_length_contig'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        hit1 = ['1', '39', '1', '39', '39', '39', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
-        hit2 = ['1', '20', '1', '20', '20', '20', '100.00', '39', '39', '1', '1', 'gene', 'contig1']
-        nucmer_hits = [
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit1))]},
-            {'contig1': [pymummer.alignment.Alignment('\t'.join(hit2))]},
-        ]
-        expected = [True, False]
-        assert len(expected) == len(nucmer_hits)
-        for i in range(len(expected)):
-            c.nucmer_hits = nucmer_hits[i]
-            self.assertEqual(c._gene_covered_by_at_least_one_full_length_contig(), expected[i])
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_mummer_variants(self):
-        '''test _get_mummer_variants'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.none.snps')
-        shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps')
-        c._get_mummer_variants()
-        self.assertEqual(c.mummer_variants, {})
-
-        clean_cluster_dir(cluster_dir)
-        snp_file = os.path.join(data_dir, 'cluster_test_get_mummer_variants.snp.snps')
-        shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
-        v2 = pymummer.variant.Variant(pymummer.snp.Snp('42\tA\tG\t42\t42\t42\t500\t500\t1\t1\tgene\tcontig2'))
-        v3 = pymummer.variant.Variant(pymummer.snp.Snp('40\tT\tC\t40\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
-        v4 = pymummer.variant.Variant(pymummer.snp.Snp('2\tC\tG\t2\t42\t42\t500\t500\t1\t1\tgene\tcontig1'))
-        expected = {
-            'contig1': [[v4], [v3, v1]],
-            'contig2': [[v2]]
-        }
-        shutil.copyfile(snp_file, c.assembly_vs_gene_coords + '.snps')
-        c._get_mummer_variants()
-        self.assertEqual(c.mummer_variants, expected)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_filter_mummer_variants(self):
-        '''test filter_mummer_variants'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v2 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v3 = pymummer.variant.Variant(pymummer.snp.Snp('12\tG\tT\t12\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        c.mummer_variants = {'contig': [[v1, v2], v3]}
-        c._filter_mummer_variants()
-        expected = {'contig': [[v1, v2]]}
-        self.assertEqual(expected, c.mummer_variants)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_codon_start(self):
-        '''test _get_codon_start'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        tests = [
-            (0, 5, 3),
-            (0, 0, 0),
-            (0, 1, 0),
-            (0, 2, 0),
-            (1, 3, 1),
-            (2, 3, 2),
-            (3, 3, 3),
-            (3, 6, 6),
-            (3, 7, 6),
-            (3, 8, 6),
-        ]
-        for t in tests:
-            self.assertEqual(c._get_codon_start(t[0], t[1]), t[2])
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_get_variant_effect(self):
-        '''test _get_variant_effect'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v2 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v3 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\tT\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v4 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tA\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v5 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v6 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tA\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v7 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v7.qry_base = 'GAT'
-        v8 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v8.qry_base = 'TGA'
-        v9 = pymummer.variant.Variant(pymummer.snp.Snp('4\t.\tG\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v9.qry_base = 'ATTCCT'
-        v10 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v10.ref_base = 'CGC'
-        v10.ref_end = 5
-        v11 = pymummer.variant.Variant(pymummer.snp.Snp('4\tC\t.\t4\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-        v11.ref_base = 'CGCGAA'
-        v11.ref_end = 8
-
-        variants = [
-            ([v1], ('SYN', '.')),
-            ([v2], ('NONSYN', 'R2S')),
-            ([v2, v1], ('NONSYN', 'R2S')),
-            ([v3, v4], ('TRUNC', 'R2trunc')),
-            ([v5], ('FSHIFT', 'R2fs')),
-            ([v6], ('FSHIFT', 'R2fs')),
-            ([v7], ('INS', 'R2_E3insD')),
-            ([v8], ('TRUNC', 'R2trunc')),
-            ([v9], ('INS', 'R2_E3insIP')),
-            ([v10], ('DEL', 'R2del')),
-            ([v11], ('DEL', 'R2_E3del')),
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t28\tC\t83\t83\tT\t22\t.\t22\tpresence_absence1_p_A10V_N_Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t0\t.\tp\t.\t0\t.\tSYN\t53\t53\tT\t108\t108\tC\t32\t.\t32\t.\tGeneric description of presence_absence1',
+            'presence_absence1\tpresence_absence\t539\t64\tcluster_name\t96\t96\t97.92\tpresence_absence1.scaffold.1\t213\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t68\t70\tG;C;G\t18;20;20\t.;.;.\t18;20;20\tpresence_absence1_p_I5A_N_Ref and reads have variant so report\tGeneric description of presence_absence1',
         ]
 
-        for t in variants:
-            self.assertEqual(t[1], c._get_variant_effect(t[0]))
-
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_make_assembly_vcf(self):
-        '''test _make_assembly_vcf'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.final_assembly_fa = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.fa')
-        c.final_assembly_bam = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.bam')
-        expected_vcf = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.vcf')
-        expected_depths = os.path.join(data_dir, 'cluster_test_make_assembly_vcf.assembly.read_depths.gz')
-        c._make_assembly_vcf()
-
-        def get_vcf_call_lines(fname):
-            with open(fname) as f:
-                lines = [x for x in f.readlines() if not x.startswith('#')]
-            return lines
-
-        expected_lines = get_vcf_call_lines(expected_vcf)
-        got_lines = get_vcf_call_lines(c.final_assembly_vcf)
-        self.assertEqual(expected_lines, got_lines)
-        self.assertEqual(file2lines(expected_depths), file2lines(c.final_assembly_read_depths))
-        clean_cluster_dir(cluster_dir)
-
-    def test_get_assembly_read_depths(self):
-        '''test _get_assembly_read_depths'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_get_assembly_read_depths.gz')
-        tests = [
-            ( ('ref1', 42), None ),
-            ( ('ref2', 1), None ),
-            ( ('ref1', 0), ('G', '.', 1, '1') ),
-            ( ('ref1', 2), ('T', 'A', 3, '2,1') ),
-            ( ('ref1', 3), ('C', 'A,G', 42, '21,11,10') ),
-            ( ('ref1', 4), ('C', 'AC', 41, '0,42') )
-        ]
+        self.assertEqual(expected, c.report_lines)
+        shutil.rmtree(tmpdir)
 
-        for t in tests:
-            self.assertEqual(c._get_assembly_read_depths(t[0][0], t[0][1]), t[1])
 
-    def test_get_samtools_variant_positions(self):
-        '''test _get_samtools_variant_positions'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_samtools_variant_positions.vcf')
+    def test_full_run_ok_variants_only_variant_not_present(self):
+        '''test complete run of cluster on a variants only gene when variant not present'''
+        refdata = reference_data.ReferenceData(
+            variants_only_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.not_present.metadata.tsv'),
+        )
+
+        tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.not_present'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)
+        self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)  # also cleans up when run()/assert fails; ignore_errors tolerates the rmtree at the end of this test
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler')
+        c.run()
         expected = [
-            ('16__cat_2_M35190.scaffold.1', 92),
-            ('16__cat_2_M35190.scaffold.1', 179),
-            ('16__cat_2_M35190.scaffold.1', 263),
-            ('16__cat_2_M35190.scaffold.6', 93)
-        ]
-        self.assertEqual(expected, c._get_samtools_variant_positions())
-
-
-    def test_get_samtools_variants(self):
-        '''test _get_samtools_variants'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_samtools_variants.vcf')
-        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_get_samtools_variants.read_depths.gz')
-        positions = [
-            ('16__cat_2_M35190.scaffold.1', 92),
-            ('16__cat_2_M35190.scaffold.1', 179),
-            ('16__cat_2_M35190.scaffold.1', 263),
-            ('16__cat_2_M35190.scaffold.6', 93)
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\t.\tGeneric description of variants_only1'
         ]
-        expected = {
-            '16__cat_2_M35190.scaffold.1': {
-                92: ('T', 'A', 123, '65,58'),
-                179: ('A', 'T', 86, '41,45'),
-                263: ('G', 'C', 97, '53,44'),
-            },
-            '16__cat_2_M35190.scaffold.6': {
-                93: ('T', 'G', 99, '56,43')
-            }
-        }
-
-        got = c._get_samtools_variants(positions)
-        self.assertEqual(expected, got)
-
-
-    def test_get_vcf_variant_counts(self):
-        '''test _get_vcf_variant_counts'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'name')
-        hit = ['1', '42', '1', '42', '42', '42', '100.00', '1000', '1000', '1', '1', 'gene', 'scaff1']
-        c.nucmer_hits = {
-            'scaff1': [pymummer.alignment.Alignment('\t'.join(hit))]
-        }
-
-        c.final_assembly_vcf = os.path.join(data_dir, 'cluster_test_get_vcf_variant_counts.vcf')
-        c._get_vcf_variant_counts()
-        expected = {'scaff1': 1}
-        self.assertEqual(expected, c.vcf_variant_counts)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_make_report_lines_nonsynonymous(self):
-        '''test _make_report_lines'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'cluster_name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('8\tA\tG\t8\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-
-        nucmer_hit = ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'gene', 'contig']
-        c.nucmer_hits = {'contig': [pymummer.alignment.Alignment('\t'.join(nucmer_hit))]}
-        c.mummer_variants = {'contig': [[v1]]}
-        c.percent_identities = {'contig': 92.42}
-        c.status_flag.set_flag(42)
-        c.assembled_ok = True
-        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_make_report_lines.read_depths.gz')
-        c._make_report_lines()
-        expected = [[
-            'gene',
-            554,
-            2,
-            'cluster_name',
-            39,
-            10,
-            92.42,
-            'SNP',
-            'NONSYN',
-            'E3G',
-            8,
-            8,
-            'A',
-            'contig',
-            39,
-            8,
-            8,
-            'G',
-            '.',
-            '.',
-            '.'
-        ]]
-        self.assertEqual(expected, c.report_lines)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_make_report_lines_synonymous(self):
-        '''test _make_report_lines'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'cluster_name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        v1 = pymummer.variant.Variant(pymummer.snp.Snp('6\tC\tT\t6\tx\tx\t39\t39\tx\tx\tgene\tcontig'))
-
-        nucmer_hit = ['1', '10', '1', '10', '10', '10', '90.00', '1000', '1000', '1', '1', 'gene', 'contig']
-        c.nucmer_hits = {'contig': [pymummer.alignment.Alignment('\t'.join(nucmer_hit))]}
-        c.mummer_variants = {'contig': [[v1]]}
-        c.percent_identities = {'contig': 92.42}
-        c.status_flag.set_flag(42)
-        c.assembled_ok = True
-        c.final_assembly_read_depths = os.path.join(data_dir, 'cluster_test_make_report_lines.read_depths.gz')
-        c._make_report_lines()
-        expected = [[
-            'gene',
-            42,
-            2,
-            'cluster_name',
-            39,
-            10,
-            92.42,
-            'SNP',
-            'SYN',
-            '.',
-            6,
-            6,
-            'C',
-            'contig',
-            39,
-            6,
-            6,
-            'T',
-            42,
-            'G',
-            '22,20'
-        ]]
         self.assertEqual(expected, c.report_lines)
-        clean_cluster_dir(cluster_dir)
-
-
-    def test_make_report_lines_assembly_fail(self):
-        '''test _make_report_lines when assembly fails'''
-        cluster_dir = os.path.join(data_dir, 'cluster_test_generic')
-        clean_cluster_dir(cluster_dir)
-        c = cluster.Cluster(cluster_dir, 'cluster_name')
-        c.gene = pyfastaq.sequences.Fasta('gene', 'GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCTGA')
-        c.status_flag.set_flag(64)
-        c.assembled_ok = False
-        c._make_report_lines()
+        shutil.rmtree(tmpdir)
+
+
+    def test_full_run_ok_variants_only_variant_not_present_always_report(self):
+        '''test complete run of cluster on a variants only gene when variant not present but always report variant'''
+        refdata = reference_data.ReferenceData(
+            variants_only_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv'),
+        )
+
+        tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.not_present.always_report'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)
+        self.addCleanup(shutil.rmtree, tmpdir)  # runs on pass and on failure, so a failed run cannot leave tmpdir behind
+
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler')
+        c.run()
         expected = [
-            [
-                'gene',
-                64,
-                2,
-                'cluster_name',
-                39,
-            ] + ['.'] * 16
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t1\tSNP\tp\tR3S\t0\t.\t.\t7\t9\tC;G;C\t65\t67\tC;G;C\t18;18;19\t.;.;.\t18;18;19\tvariants_only1_p_R3S_Y_Ref and assembly have wild type, but always report anyway\tGeneric description of variants_only1'
         ]
         self.assertEqual(expected, c.report_lines)
-        clean_cluster_dir(cluster_dir)
+
+
+    def test_full_run_ok_variants_only_variant_is_present(self):
+        '''test complete run of cluster on a variants only gene when variant is present'''
+        refdata = reference_data.ReferenceData(
+            variants_only_fa=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.fa'),
+            metadata_tsv=os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only.present.metadata.tsv'),
+        )
+
+        tmpdir = 'tmp.cluster_test_full_run_ok_variants_only.present'
+        shutil.copytree(os.path.join(data_dir, 'cluster_test_full_run_ok_variants_only'), tmpdir)
+        self.addCleanup(shutil.rmtree, tmpdir)  # runs on pass and on failure, so a failed run cannot leave tmpdir behind
 
+        c = cluster.Cluster(tmpdir, 'cluster_name', refdata, spades_other_options='--only-assembler')
+        c.run()
+
+        expected = [
+            'variants_only1\tvariants_only\t27\t66\tcluster_name\t96\t96\t100.0\tvariants_only1.scaffold.1\t215\t1\tSNP\tp\tI5A\t1\t.\t.\t13\t15\tG;C;G\t71\t73\tG;C;G\t17;17;17\t.;.;.\t17;17;17\tvariants_only1_p_I5A_N_Ref and reads have variant so report\tGeneric description of variants_only1',
+        ]
+
+        self.assertEqual(expected, c.report_lines)
diff --git a/ariba/tests/clusters_test.py b/ariba/tests/clusters_test.py
index 3d9a6d5a..3d20c310 100644
--- a/ariba/tests/clusters_test.py
+++ b/ariba/tests/clusters_test.py
@@ -4,19 +4,20 @@
 import pysam
 import pyfastaq
 import filecmp
-from ariba import clusters
+from ariba import clusters, external_progs, reference_data
 
 modules_dir = os.path.dirname(os.path.abspath(clusters.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
 
 
 class TestClusters(unittest.TestCase):
     def setUp(self):
         self.cluster_dir = 'tmp.Cluster'
+        refdata = reference_data.ReferenceData(non_coding_fa = os.path.join(data_dir, 'clusters_test_dummy_db.fa'))
         reads1 = os.path.join(data_dir, 'clusters_test_dummy_reads_1.fq')
         reads2 = os.path.join(data_dir, 'clusters_test_dummy_reads_2.fq')
-        db = os.path.join(data_dir, 'clusters_test_dummy_db.fa')
-        self.clusters = clusters.Clusters(db, reads1, reads2, self.cluster_dir)
+        self.clusters = clusters.Clusters(refdata, reads1, reads2, self.cluster_dir, extern_progs)
 
 
     def tearDown(self):
@@ -68,7 +69,8 @@ def test_bam_to_clusters_reads(self):
         reads1 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.reads_1.fq')
         reads2 = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.reads_2.fq')
         ref = os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.db.fa')
-        c = clusters.Clusters(ref, reads1, reads2, clusters_dir)
+        refdata = reference_data.ReferenceData(presence_absence_fa = ref)
+        c = clusters.Clusters(refdata, reads1, reads2, clusters_dir, extern_progs)
         shutil.copyfile(os.path.join(data_dir, 'clusters_test_bam_to_clusters_reads.bam'), c.bam)
         c._bam_to_clusters_reads()
         expected = [
@@ -121,28 +123,40 @@ class FakeCluster:
             def __init__(self, lines):
                 self.report_lines = lines
 
-        self.clusters.clusters = {
-            'gene1': FakeCluster([['gene1 line1']]),
-            'gene2': FakeCluster([['gene2 line2']])
+        clusters_dict = {
+            'gene1': FakeCluster(['gene1\tline1']),
+            'gene2': FakeCluster(['gene2\tline2'])
         }
 
-        self.clusters._write_reports()
+        tmp_tsv = 'tmp.test_write_reports.tsv'
+        tmp_xls = 'tmp.test_write_reports.xls'
+        clusters.Clusters._write_reports(clusters_dict, tmp_tsv, tmp_xls)
+
         expected = os.path.join(data_dir, 'clusters_test_write_report.tsv')
-        self.assertTrue(filecmp.cmp(expected, self.clusters.report_file_tsv, shallow=False))
-        self.assertTrue(os.path.exists(self.clusters.report_file_xls))
+        self.assertTrue(filecmp.cmp(expected, tmp_tsv, shallow=False))
+        self.assertTrue(os.path.exists(tmp_xls))
+        os.unlink(tmp_tsv)
+        os.unlink(tmp_xls)
 
 
-    def test_write_catted_assembled_genes_fasta(self):
-        '''test _write_catted_assembled_genes_fasta'''
+    def test_write_catted_assembled_seqs_fasta(self):
+        '''test _write_catted_assembled_seqs_fasta'''
+        class FakeAssemblyCompare:
+            def __init__(self, filename):
+                self.assembled_ref_seqs_file = filename
+
         class FakeCluster:
             def __init__(self, filename):
-                self.final_assembled_genes_fa = filename
+                #self.final_assembled_genes_fa = filename
+                self.assembly_compare = FakeAssemblyCompare(filename)
 
         self.clusters.clusters = {
             'gene1': FakeCluster(os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.in.gene1.fa')),
             'gene2': FakeCluster(os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.in.gene2.fa')),
         }
 
-        self.clusters._write_catted_assembled_genes_fasta()
+        tmp_file = 'tmp.test_write_catted_assembled_seqs_fasta.fa'
+        self.clusters._write_catted_assembled_seqs_fasta(tmp_file)
         expected = os.path.join(data_dir, 'clusters_test_write_catted_assembled_genes_fasta.expected.out.fa')
-        self.assertTrue(filecmp.cmp(expected, self.clusters.catted_assembled_genes_fasta, shallow=False))
+        self.assertTrue(filecmp.cmp(expected, tmp_file, shallow=False))
+        os.unlink(tmp_file)
diff --git a/ariba/tests/data/assembly_compare_parse_nucmer_coords_file.coords b/ariba/tests/data/assembly_compare_parse_nucmer_coords_file.coords
new file mode 100644
index 00000000..47c18947
--- /dev/null
+++ b/ariba/tests/data/assembly_compare_parse_nucmer_coords_file.coords
@@ -0,0 +1,7 @@
+a.fa b.fa
+NUCMER
+
+[S1]	[E1]	[S2]	[E2]	[LEN 1]	[LEN 2]	[% IDY]	[LEN R]	[LEN Q]	[FRM]	[TAGS]
+1	1000	1	1000	1000	1000	100.00	1000	1000	1	1	ref	contig1	[IDENTITY]
+1	240	1	240	240	240	100.00	1000	580	1	1	ref	contig2
+661	1000	241	580	340	340	100.00	1000	580	1	1	ref	contig2
diff --git a/ariba/tests/data/cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa b/ariba/tests/data/assembly_compare_write_assembled_reference_sequences.expected.fa
similarity index 77%
rename from ariba/tests/data/cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa
rename to ariba/tests/data/assembly_compare_write_assembled_reference_sequences.expected.fa
index 87aa2147..9fecb1a5 100644
--- a/ariba/tests/data/cluster_test_nucmer_hits_to_assembled_gene_sequences.expected.out.fa
+++ b/ariba/tests/data/assembly_compare_write_assembled_reference_sequences.expected.fa
@@ -1,7 +1,7 @@
->ref_gene.1.147.contig1.61.207.+.complete
+>ref_seq.1.147.contig1.61.207.+.complete
 ATGGTACAAGACGGCCCTTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAAT
 TATCGAACATCGTCGCGTTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT
 GCCAGTGGCATCTGTGTAAGCGCTTAG
->ref_gene.18.120.contig2.1.103.-
+>ref_seq.18.120.contig2.1.103.-
 TTTGCAGTCCTGTGTACTTGCGGGTCGCTCCTTTGCATTGAATTATCGAACATCGTCGCG
 TTCAAGATCCCGCGAAAAAAATTATAGATCGCAGGATATCACT
diff --git a/ariba/tests/data/cluster_test_assemble_with_spades/reads_1.fq b/ariba/tests/data/assembly_test_assemble_with_spades_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_assemble_with_spades/reads_1.fq
rename to ariba/tests/data/assembly_test_assemble_with_spades_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_assemble_with_spades/reads_2.fq b/ariba/tests/data/assembly_test_assemble_with_spades_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_assemble_with_spades/reads_2.fq
rename to ariba/tests/data/assembly_test_assemble_with_spades_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_assemble_with_spades.gene.fa b/ariba/tests/data/assembly_test_assemble_with_spades_ref.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_assemble_with_spades.gene.fa
rename to ariba/tests/data/assembly_test_assemble_with_spades_ref.fa
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation.in.fa b/ariba/tests/data/assembly_test_fix_contig_orientation.in.fa
similarity index 77%
rename from ariba/tests/data/cluster_test_fix_contig_orientation.in.fa
rename to ariba/tests/data/assembly_test_fix_contig_orientation.in.fa
index 183e9114..0b9605b0 100644
--- a/ariba/tests/data/cluster_test_fix_contig_orientation.in.fa
+++ b/ariba/tests/data/assembly_test_fix_contig_orientation.in.fa
@@ -24,3 +24,10 @@ ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
+>match_both_strands
+CGCTTCGGTCCACCATGATGGAGCGCCATGTGATGGGATTTCCAACCCCGTTGTTTCAGG
+ACTCATGGCATTTACCACCGACAACCGTTTATAATCCATGAGCAAGGAATACAGTGGAGA
+CAGGATTGGTTGTATTGGACTGAATACATGCCCCACTGTTACCCCGAAAGTTAACACGTA
+TAGTTTAAATAAACTATGGGTACGTGTTAACTTTCGGGGTAACAGTGGGGCATGTATTCA
+GTCCAATACAACCAATCCTGTCTCCACTGTATTCCTTGCTCATGGATTATAAACGGTTGT
+CGGTGGTAAATGCCATGAGTCCTGAAACAACGGGGTTGGAAATCCCATCACATGGCGCTC
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation.out.fa b/ariba/tests/data/assembly_test_fix_contig_orientation.out.fa
similarity index 77%
rename from ariba/tests/data/cluster_test_fix_contig_orientation.out.fa
rename to ariba/tests/data/assembly_test_fix_contig_orientation.out.fa
index ce3a0b5d..a1d0f7b2 100644
--- a/ariba/tests/data/cluster_test_fix_contig_orientation.out.fa
+++ b/ariba/tests/data/assembly_test_fix_contig_orientation.out.fa
@@ -24,3 +24,10 @@ ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
 ACAAAAAAAAAAAAAAAGAGAGAGAAGGAGGAGAAGAAAAAAAAAAAAAAAAAAAAAATT
+>match_both_strands
+CGCTTCGGTCCACCATGATGGAGCGCCATGTGATGGGATTTCCAACCCCGTTGTTTCAGG
+ACTCATGGCATTTACCACCGACAACCGTTTATAATCCATGAGCAAGGAATACAGTGGAGA
+CAGGATTGGTTGTATTGGACTGAATACATGCCCCACTGTTACCCCGAAAGTTAACACGTA
+TAGTTTAAATAAACTATGGGTACGTGTTAACTTTCGGGGTAACAGTGGGGCATGTATTCA
+GTCCAATACAACCAATCCTGTCTCCACTGTATTCCTTGCTCATGGATTATAAACGGTTGT
+CGGTGGTAAATGCCATGAGTCCTGAAACAACGGGGTTGGAAATCCCATCACATGGCGCTC
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation.gene.fa b/ariba/tests/data/assembly_test_fix_contig_orientation.ref.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_fix_contig_orientation.gene.fa
rename to ariba/tests/data/assembly_test_fix_contig_orientation.ref.fa
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa b/ariba/tests/data/assembly_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa
rename to ariba/tests/data/assembly_test_gapfill_with_gapfiller.scaffolds_no_gaps.fa
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa b/ariba/tests/data/assembly_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa
rename to ariba/tests/data/assembly_test_gapfill_with_gapfiller.scaffolds_with_gaps.fa
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/reads_1.fq b/ariba/tests/data/assembly_test_gapfill_with_gapfiller_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller/reads_1.fq
rename to ariba/tests/data/assembly_test_gapfill_with_gapfiller_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/reads_2.fq b/ariba/tests/data/assembly_test_gapfill_with_gapfiller_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_gapfill_with_gapfiller/reads_2.fq
rename to ariba/tests/data/assembly_test_gapfill_with_gapfiller_reads_2.fq
diff --git a/ariba/tests/data/assembly_test_has_gaps_to_fill.has_gaps.fa b/ariba/tests/data/assembly_test_has_gaps_to_fill.has_gaps.fa
new file mode 100644
index 00000000..14bf0bcb
--- /dev/null
+++ b/ariba/tests/data/assembly_test_has_gaps_to_fill.has_gaps.fa
@@ -0,0 +1,2 @@
+>seq
+CATCATCATCATnCATAATATATATAT
diff --git a/ariba/tests/data/assembly_test_has_gaps_to_fill.no_gaps.fa b/ariba/tests/data/assembly_test_has_gaps_to_fill.no_gaps.fa
new file mode 100644
index 00000000..896d4d40
--- /dev/null
+++ b/ariba/tests/data/assembly_test_has_gaps_to_fill.no_gaps.fa
@@ -0,0 +1,4 @@
+>seq
+ACTATCCATGCATGCATACT
+>seq2
+CACACGTCAGTCAAG
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam.assembly.fa b/ariba/tests/data/assembly_test_parse_assembly_bam.assembly.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_parse_assembly_bam.assembly.fa
rename to ariba/tests/data/assembly_test_parse_assembly_bam.assembly.fa
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam.bam b/ariba/tests/data/assembly_test_parse_assembly_bam.bam
similarity index 100%
rename from ariba/tests/data/cluster_test_parse_assembly_bam.bam
rename to ariba/tests/data/assembly_test_parse_assembly_bam.bam
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds.in.fa b/ariba/tests/data/assembly_test_rename_scaffolds.in.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_rename_scaffolds.in.fa
rename to ariba/tests/data/assembly_test_rename_scaffolds.in.fa
diff --git a/ariba/tests/data/assembly_test_rename_scaffolds.out.fa b/ariba/tests/data/assembly_test_rename_scaffolds.out.fa
new file mode 100644
index 00000000..5866be1d
--- /dev/null
+++ b/ariba/tests/data/assembly_test_rename_scaffolds.out.fa
@@ -0,0 +1,6 @@
+>prefix.scaffold.1
+TACG
+>prefix.scaffold.2
+ACGT
+>prefix.scaffold.3
+CGTA
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace.contigs.fa b/ariba/tests/data/assembly_test_scaffold_with_sspace_contigs.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_scaffold_with_sspace.contigs.fa
rename to ariba/tests/data/assembly_test_scaffold_with_sspace_contigs.fa
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace/reads_1.fq b/ariba/tests/data/assembly_test_scaffold_with_sspace_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_scaffold_with_sspace/reads_1.fq
rename to ariba/tests/data/assembly_test_scaffold_with_sspace_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace/reads_2.fq b/ariba/tests/data/assembly_test_scaffold_with_sspace_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_scaffold_with_sspace/reads_2.fq
rename to ariba/tests/data/assembly_test_scaffold_with_sspace_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation/reads_1.fq b/ariba/tests/data/assembly_test_set_assembly_kmer_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_fix_contig_orientation/reads_1.fq
rename to ariba/tests/data/assembly_test_set_assembly_kmer_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation/reads_2.fq b/ariba/tests/data/assembly_test_set_assembly_kmer_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_fix_contig_orientation/reads_2.fq
rename to ariba/tests/data/assembly_test_set_assembly_kmer_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_get_mummer_variants.none.snps b/ariba/tests/data/assembly_variants_test_get_mummer_variants.none.snps
similarity index 100%
rename from ariba/tests/data/cluster_test_get_mummer_variants.none.snps
rename to ariba/tests/data/assembly_variants_test_get_mummer_variants.none.snps
diff --git a/ariba/tests/data/cluster_test_get_mummer_variants.snp.snps b/ariba/tests/data/assembly_variants_test_get_mummer_variants.snp.snps
similarity index 100%
rename from ariba/tests/data/cluster_test_get_mummer_variants.snp.snps
rename to ariba/tests/data/assembly_variants_test_get_mummer_variants.snp.snps
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
new file mode 100644
index 00000000..b2e074e6
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_metadata.tsv
@@ -0,0 +1,11 @@
+presence_absence	p	D2E	N	ref has wild type D (GAT=D, GAA=E)
+presence_absence	p	S3R	N	ref has variant type R (AGA=R, AGT=S)
+presence_absence	p	D4E	N	ref has variant type E (GAA=E, GAC=D)
+presence_absence	p	A5D	N	ref has wild type A (GCG=A)
+variants_only	p	D2E	N	ref has wild type D (GAT=D, GAA=E)
+variants_only	p	S3R	N	ref has variant type R (AGA=R, AGT=S)
+variants_only	p	D4E	N	ref has variant type E (GAA=E, GAC=D)
+variants_only	p	A5D	N	ref has wild type A (GCG=A)
+non_coding	n	C3A	N	ref has variant type A
+non_coding	n	T5A	N	ref has wild type T
+non_coding	n	C6G	N	ref has variant type G
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa
new file mode 100644
index 00000000..d60406be
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_coding_presence_absence.fa
@@ -0,0 +1,2 @@
+>presence_absence
+ATGGATAGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA
diff --git a/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
new file mode 100644
index 00000000..b2e074e6
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_one_variant_for_one_contig_non_coding.metadata.tsv
@@ -0,0 +1,11 @@
+presence_absence	p	D2E	N	ref has wild type D (GAT=D, GAA=E)
+presence_absence	p	S3R	N	ref has variant type R (AGA=R, AGT=S)
+presence_absence	p	D4E	N	ref has variant type E (GAA=E, GAC=D)
+presence_absence	p	A5D	N	ref has wild type A (GCG=A)
+variants_only	p	D2E	N	ref has wild type D (GAT=D, GAA=E)
+variants_only	p	S3R	N	ref has variant type R (AGA=R, AGT=S)
+variants_only	p	D4E	N	ref has variant type E (GAA=E, GAC=D)
+variants_only	p	A5D	N	ref has wild type A (GCG=A)
+non_coding	n	C3A	N	ref has variant type A
+non_coding	n	T5A	N	ref has wild type T
+non_coding	n	C6G	N	ref has variant type G
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa b/ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa
new file mode 100644
index 00000000..bc1221d6
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_non_coding.fa
@@ -0,0 +1,2 @@
+>non_coding
+CTACTGACGTACTGATCGATCGTATGAA
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.fa b/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.fa
new file mode 100644
index 00000000..d60406be
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.fa
@@ -0,0 +1,2 @@
+>presence_absence
+ATGGATAGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.snps b/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.snps
new file mode 100644
index 00000000..9171478b
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_presence_absence.snps
@@ -0,0 +1,3 @@
+9	A	T	9	x	x	42	42	x	x	presence_absence	contig1
+14	C	A	14	x	x	42	42	x	x	presence_absence	contig1
+15	G	C	15	x	x	42	42	x	x	presence_absence	contig1
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_variants_only.fa b/ariba/tests/data/assembly_variants_test_get_variants_variants_only.fa
new file mode 100644
index 00000000..0d2677e6
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_variants_only.fa
@@ -0,0 +1,2 @@
+>variants_only
+ATGGATAGAGAAGCGATGACCCATGAAGCGACCGAACGCTGA
diff --git a/ariba/tests/data/assembly_variants_test_get_variants_variants_only.snps b/ariba/tests/data/assembly_variants_test_get_variants_variants_only.snps
new file mode 100644
index 00000000..c9e5c6fb
--- /dev/null
+++ b/ariba/tests/data/assembly_variants_test_get_variants_variants_only.snps
@@ -0,0 +1,3 @@
+9	A	T	9	x	x	42	42	x	x	variants_only	contig1
+14	C	A	14	x	x	42	42	x	x	variants_only	contig1
+15	G	C	15	x	x	42	42	x	x	variants_only	contig1
diff --git a/ariba/tests/data/cluster_test_choose_best_gene/reads_1.fq b/ariba/tests/data/best_seq_chooser_best_seq_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_choose_best_gene/reads_1.fq
rename to ariba/tests/data/best_seq_chooser_best_seq_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_choose_best_gene/reads_2.fq b/ariba/tests/data/best_seq_chooser_best_seq_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_choose_best_gene/reads_2.fq
rename to ariba/tests/data/best_seq_chooser_best_seq_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_choose_best_gene/genes.fa b/ariba/tests/data/best_seq_chooser_best_seq_ref.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_choose_best_gene/genes.fa
rename to ariba/tests/data/best_seq_chooser_best_seq_ref.fa
diff --git a/ariba/tests/data/best_seq_chooser_best_seq_ref.fa.fai b/ariba/tests/data/best_seq_chooser_best_seq_ref.fa.fai
new file mode 100644
index 00000000..ae529616
--- /dev/null
+++ b/ariba/tests/data/best_seq_chooser_best_seq_ref.fa.fai
@@ -0,0 +1,3 @@
+1	300	3	60	61
+2	279	311	60	61
+3	300	598	60	61
diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_1.fq b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_1.fq
rename to ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_2.fq b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/reads_2.fq
rename to ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/genes.fa b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_get_best_gene_by_alignment_score/genes.fa
rename to ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa
diff --git a/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa.fai b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa.fai
new file mode 100644
index 00000000..ae529616
--- /dev/null
+++ b/ariba/tests/data/best_seq_chooser_get_best_seq_by_alignment_score_ref.fa.fai
@@ -0,0 +1,3 @@
+1	300	3	60	61
+2	279	311	60	61
+3	300	598	60	61
diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/reads_1.fq b/ariba/tests/data/best_seq_chooser_total_alignment_score_reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_get_total_alignment_score/reads_1.fq
rename to ariba/tests/data/best_seq_chooser_total_alignment_score_reads_1.fq
diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/reads_2.fq b/ariba/tests/data/best_seq_chooser_total_alignment_score_reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_get_total_alignment_score/reads_2.fq
rename to ariba/tests/data/best_seq_chooser_total_alignment_score_reads_2.fq
diff --git a/ariba/tests/data/cluster_test_get_total_alignment_score/genes.fa b/ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_get_total_alignment_score/genes.fa
rename to ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa
diff --git a/ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa.fai b/ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa.fai
new file mode 100644
index 00000000..ae529616
--- /dev/null
+++ b/ariba/tests/data/best_seq_chooser_total_alignment_score_ref_seqs.fa.fai
@@ -0,0 +1,3 @@
+1	300	3	60	61
+2	279	311	60	61
+3	300	598	60	61
diff --git a/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa b/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa
deleted file mode 100644
index 85ca4cb1..00000000
--- a/ariba/tests/data/cdhit_test_enumerate_fasta.in.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->a
-A
->b
-G
->c
-T
diff --git a/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa b/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa
deleted file mode 100644
index 4b36e898..00000000
--- a/ariba/tests/data/cdhit_test_enumerate_fasta.out.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->1
-A
->2
-G
->3
-T
diff --git a/ariba/tests/data/cdhit_test_fake_run.out.fa b/ariba/tests/data/cdhit_test_fake_run.out.fa
index 2a8bbc9e..bf8b12c8 100644
--- a/ariba/tests/data/cdhit_test_fake_run.out.fa
+++ b/ariba/tests/data/cdhit_test_fake_run.out.fa
@@ -1,4 +1,4 @@
->0
+>seq1
 TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
 GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
 AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
@@ -8,7 +8,7 @@ ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
 GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
 TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
 AACTCTATCGTAGGGTCGCA
->1
+>seq2
 TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
 GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
 AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
@@ -18,7 +18,7 @@ ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
 GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
 TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
 AACTCTATGTAGGGTCGCA
->2
+>seq3
 TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
 GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
 AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
@@ -28,7 +28,7 @@ ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
 GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
 TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
 AACTCTATGTAGGGTCGCA
->3
+>seq4
 CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
 TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
 TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa
deleted file mode 100644
index bf8b12c8..00000000
--- a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.fa
+++ /dev/null
@@ -1,40 +0,0 @@
->seq1
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATCGTAGGGTCGCA
->seq2
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATGTAGGGTCGCA
->seq3
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
-GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC
-ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
-AACTCTATGTAGGGTCGCA
->seq4
-CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
-TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
-TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
-TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
-AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
-CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
-AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
-GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
-TGCGGTCAACAAGTCCAGGT
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa
deleted file mode 100644
index 9f7eca5d..00000000
--- a/ariba/tests/data/cdhit_test_parse_cluster_info_file.in.renamed.fa
+++ /dev/null
@@ -1,40 +0,0 @@
->1
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATCGTAGGGTCGCA
->2
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATGTAGGGTCGCA
->3
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGTTCTATGCAGGCTTGTGAA
-GGAGTTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAGGCCATACACTTAGCTCA
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGATAGCC
-ACGGTGAATGGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTATGCTCGATCATGAAAGTGAAAGGCGCTCCAAGATGCGAATTGAAACAA
-AACTCTATGTAGGGTCGCA
->4
-CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
-TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
-TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
-TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
-AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
-CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
-AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
-GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
-TGCGGTCAACAAGTCCAGGT
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.infile b/ariba/tests/data/cdhit_test_parse_cluster_info_file.infile
new file mode 100644
index 00000000..69a6f14b
--- /dev/null
+++ b/ariba/tests/data/cdhit_test_parse_cluster_info_file.infile
@@ -0,0 +1,4 @@
+0	500aa, >seq1... *
+0	499aa, >seq2... at 99.40%
+0	499aa, >seq3... at 98.40%
+1	500aa, >seq4... *
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa b/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa
deleted file mode 100644
index dba7562b..00000000
--- a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa
+++ /dev/null
@@ -1,20 +0,0 @@
->1
-TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
-GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
-AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
-TGATGTGATGTGTCAGGTCTGGAGTTTACATATGTCCTGCCACGGTCCTATTTGTTAGAG
-AGGCCTTCAGGCGGCCCCTGCCCGTCGATTCGGCAAACTGCCGAAAACGGAGAGACAGCC
-ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
-GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
-TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
-AACTCTATCGTAGGGTCGCA
->4
-CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
-TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
-TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
-TTCCAACCACGTCAAGGCAACCTTGGTTTAGCACAGGGCTCCCAGTGGGTGTAAGGGATG
-AACACTACCCGGCCCACCGTCGATTTAGCCCTAAATGGTCTATTGCTCACGGGTAGCACA
-CAAGTAATAAAAACGTATTCAGCTCGAGTCAGCGTCCAGCCATTTTACTTTGCGTCATCG
-AGGGGTAGTGCCTCCGAGAATCAAGGTTTGATTATACTAAACGGAGGGGCCTACCACTCA
-GCCAGTCTTTGCATCGTCCATTCCCGCCGTTTATGGGTCACTATTCATTCGGAATTTGGA
-TGCGGTCAACAAGTCCAGGT
diff --git a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr b/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr
deleted file mode 100644
index 17451594..00000000
--- a/ariba/tests/data/cdhit_test_parse_cluster_info_file.out.fa.bak.clstr
+++ /dev/null
@@ -1,4 +0,0 @@
-0	500aa, >1... *
-0	499aa, >2... at 99.40%
-0	499aa, >3... at 98.40%
-1	500aa, >4... *
diff --git a/ariba/tests/data/cdhit_test_rename_fasta.in.fa b/ariba/tests/data/cdhit_test_rename_fasta.in.fa
deleted file mode 100644
index 11d5e25a..00000000
--- a/ariba/tests/data/cdhit_test_rename_fasta.in.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->a
-A
->b
-C
->c
-G
diff --git a/ariba/tests/data/cdhit_test_run.out.fa b/ariba/tests/data/cdhit_test_run.out.fa
index 12a02b2b..1aae0abd 100644
--- a/ariba/tests/data/cdhit_test_run.out.fa
+++ b/ariba/tests/data/cdhit_test_run.out.fa
@@ -1,4 +1,4 @@
->0
+>seq1
 TGGGGAATATAGTGGGTACTGTGTGTTGAGCGATTCCCGAGCCCTATGCAGGCTTGTGAA
 GGAGGTCGTGGGATGCTCGTGTCTTCACGAACTTAAAGCCCCTCTTTGGCTTAGGGCCGG
 AGATCGCGTCATAAGTGTAATCTAGCGTTGCAGGTATGGGTAAGGCCATACACTTAGCTC
@@ -8,7 +8,7 @@ ACGGTGAATAGAATCTTGGCATACGGTTAATCAGTGCTCTGCTAGTCCTGCTTTCTCTAA
 GCTTATAGAATTCCTGATATATTAAGTAACTTTTCCATTCCATAGACGCGACGAACTGGA
 TACACTCACGTAGGCTCGATCATGAAAGTGAAAGGCGCTCCAAGTTGCGAATTGAAACAA
 AACTCTATCGTAGGGTCGCA
->1
+>seq4
 CAAGGGCGGATTCGAACGGGTAACAGGGATCTGATTGGCTCCGGCCAGCTGGTGGATATC
 TGCATCCGTTGACCCACCAACTTTAGCAGTATAGACCCTAAACTGGCATGGTGCCCTTTT
 TATATCCCGATGCATCTGGAGAAACCGTCAGGACCTCTTAAGCCCCGTGGAGAGCCAAAC
diff --git a/ariba/tests/data/cluster_test_assemble_with_spades/genes.fa b/ariba/tests/data/cluster_test_assemble_with_spades/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_assemble_with_spades/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_choose_best_gene.gene.fa b/ariba/tests/data/cluster_test_choose_best_gene.gene.fa
deleted file mode 100644
index 6d31a2ed..00000000
--- a/ariba/tests/data/cluster_test_choose_best_gene.gene.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->1
-AGCGCCTAGCTTTGGCACTTCAGGAGCGCCCGGAAATAATGGCGGGCGATGAAGGTTCTG
-TAGGTACGCAAGATCCCTCTTAATCACAGTGGTGTAATCTGCGGGTCAGACCCTGTTAAC
-CCGTGGCTTTCACACTCCCTCCTATGGGTAATCAATCCAGAAAGGGGCCGAAATGCAAAA
-GTCTTAAGGACTCTGCGAGGCAAAGTACGGGCGAACTAAACCCCCGTGACAGGTCAGACG
-TTGTTTCGGCAATCTGTCGCGCTCCCACACCTATAAGCGTACACCGTCTCTTCTGCCAGC
diff --git a/ariba/tests/data/cluster_test_count_reads_1.fq b/ariba/tests/data/cluster_test_count_reads_1.fq
new file mode 100644
index 00000000..9d691782
--- /dev/null
+++ b/ariba/tests/data/cluster_test_count_reads_1.fq
@@ -0,0 +1,8 @@
+@read1
+ACGT
++
+HHHH
+@read2
+ACG
++
+:-)
diff --git a/ariba/tests/data/cluster_test_count_reads_2.fq b/ariba/tests/data/cluster_test_count_reads_2.fq
new file mode 100644
index 00000000..e458df7f
--- /dev/null
+++ b/ariba/tests/data/cluster_test_count_reads_2.fq
@@ -0,0 +1,8 @@
+@read1
+ACGT
++
+HHHH
+@read2
+ACG
++
+:-D
diff --git a/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa b/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa
deleted file mode 100644
index 5d5102b8..00000000
--- a/ariba/tests/data/cluster_test_fix_contig_orientation/genes.fa
+++ /dev/null
@@ -1,10 +0,0 @@
->gene
-ACTTACCGGTTCGGGGTCTAAACCAACCATTAAACTGCGACAACCATTCATCCTGGAGTA
-CGCTTCGGTCCACCATGATGGAGCGCCATGTGATGGGATTTCCAACCCCGTTGTTTCAGG
-ACTCATGGCATTTACCACCGACAACCGTTTATAATCCATGAGCAAGGAATACAGTGGAGA
-CAGGATTGGTTGTATTGGACTGAATACATGCCCCACTGTTACCCCGAAAGTTAACACGTA
-CCCATAGTTTATTTAAACTAGGCACTCCCGATCAGCCAAGACTTAAAAAGGGGGATAGGA
-ATATCAACGTAGTACTTCTCGGTTGATCCGTGTTTTTTAATCTAAAATATAATGTGTAGG
-CAGCTATCGTGCTAATCGTTGAAATGAGCAGGCGAAATGCCGTTTACAACGACGCTAAAC
-CTCCAAGTCGAATTAAGCCAAATTGTGCCTTCCATATGACCTCCACAGATTTGGGCTGGC
-ACTGTCAGCGTAGTTGCGCT
diff --git a/ariba/tests/data/cluster_test_full_run_assembly_fail.noncoding.fa b/ariba/tests/data/cluster_test_full_run_assembly_fail.noncoding.fa
new file mode 100644
index 00000000..7a82c890
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_assembly_fail.noncoding.fa
@@ -0,0 +1,8 @@
+>noncoding_ref_seq
+TTGGTTCCGTTCTTATTCTTCAGCTATTGATAGCAATGGTCCAACACATCGACCGACCCC
+CACAAAGACAGCAGACATCGATTGCTAAGGGGCCCGAAATTTGCTGGTCCGCACGAACAC
+GACTCCGCCAAATACGTATTCGTCCGCGCAGGATGTGGCGAACTCTCAGATGTTCGGCTT
+TTCTTTCGATCGAAGCGACTGGCCGGAGGCAGTCAGACGGCGATAGGAGTAGAATCCACC
+GGAGGGTCTGCCCTGCGGTCATCAGCTACCTTCAACCCTCAAGTCCTCGTCGCCATCCGA
+AAAAACCTTCCCCACGATAGCGTACATCACGCTTTGGTTACAGGAAGAATGAGGGATTCA
+AATGAAGATCCCATGTACACTGTAAAGGTCTTTCCGAGTA
diff --git a/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_1.fq b/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_1.fq
new file mode 100644
index 00000000..932a2328
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_1.fq
@@ -0,0 +1,8 @@
+@non_coding:1:92:322/1
+GCCCGAAATTTGCTGGTCCGCACGAACACGACTCCGCCAAATACGTATTCGTCCGCGCAGGATGTGGCGAACTCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@non_coding:2:11:253/1
+TCTTATTCTTCAGCTATTGATAGCAATGGTCCAACACATCGACCGACCCCCACAAAGACAGCAGACATCGATTGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_2.fq b/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_2.fq
new file mode 100644
index 00000000..3fc31962
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_assembly_fail/reads_2.fq
@@ -0,0 +1,8 @@
+@non_coding:1:92:322/2
+TCGGAAAGACCTTTACAGTGTACATGGGATCTTCATTTGAATCCCTCATTCTTCCTGTAACCAAAGCGTGATGTAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@non_coding:2:11:253/2
+GATGTACGCTATCGTGGGGAAGGTTTTTTCGGATGGCGACGAGGACTTGAGGGTTGAAGGTAGCTGATGACCGCAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_assembly_fail/references.fa b/ariba/tests/data/cluster_test_full_run_assembly_fail/references.fa
new file mode 100644
index 00000000..7a82c890
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_assembly_fail/references.fa
@@ -0,0 +1,8 @@
+>noncoding_ref_seq
+TTGGTTCCGTTCTTATTCTTCAGCTATTGATAGCAATGGTCCAACACATCGACCGACCCC
+CACAAAGACAGCAGACATCGATTGCTAAGGGGCCCGAAATTTGCTGGTCCGCACGAACAC
+GACTCCGCCAAATACGTATTCGTCCGCGCAGGATGTGGCGAACTCTCAGATGTTCGGCTT
+TTCTTTCGATCGAAGCGACTGGCCGGAGGCAGTCAGACGGCGATAGGAGTAGAATCCACC
+GGAGGGTCTGCCCTGCGGTCATCAGCTACCTTCAACCCTCAAGTCCTCGTCGCCATCCGA
+AAAAACCTTCCCCACGATAGCGTACATCACGCTTTGGTTACAGGAAGAATGAGGGATTCA
+AATGAAGATCCCATGTACACTGTAAAGGTCTTTCCGAGTA
diff --git a/ariba/tests/data/cluster_test_full_run_choose_ref_fail.presence_absence.fa b/ariba/tests/data/cluster_test_full_run_choose_ref_fail.presence_absence.fa
new file mode 100644
index 00000000..09fa61e8
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_choose_ref_fail.presence_absence.fa
@@ -0,0 +1,2 @@
+>presence_absence_gene
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_1.fq b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_1.fq
new file mode 100644
index 00000000..022d4f47
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_1.fq
@@ -0,0 +1,4 @@
+@read1/1
+CACTGACTTCACTTACTATCTACTGAATATACTTATCATCTACTCGATGCATGCATGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_2.fq b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_2.fq
new file mode 100644
index 00000000..022d4f47
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/reads_2.fq
@@ -0,0 +1,4 @@
+@read1/1
+CACTGACTTCACTTACTATCTACTGAATATACTTATCATCTACTCGATGCATGCATGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_choose_ref_fail/references.fa b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/references.fa
new file mode 100644
index 00000000..bf3b2726
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_choose_ref_fail/references.fa
@@ -0,0 +1,4 @@
+>presence_absence_gene1
+ATGAACCCCGGGGTTTTTTAA
+>presence_absence_gene2
+ATGACCCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding.fa b/ariba/tests/data/cluster_test_full_run_ok_non_coding.fa
new file mode 100644
index 00000000..3278f9e5
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding.fa
@@ -0,0 +1,6 @@
+>noncoding1
+CGTACGCGGGTGGAGACATGTACTCCACTCCCATACATCCCTAAGTTTGTCCCTAAGGCA
+GTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGATCAC
+>noncoding2
+TCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCT
+GCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCC
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv
new file mode 100644
index 00000000..ce964cd7
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding.metadata.tsv
@@ -0,0 +1,5 @@
+noncoding1	.	.	N	generic description of noncoding1
+noncoding1	n	A6G	N	variant in ref and reads so should report
+noncoding1	n	G9T	N	wild type in ref and reads so should not report
+noncoding1	n	A14T	N	ref has wild type, reads has variant so should report
+noncoding1	n	A40C	N	ref has variant, reads has wild type so should not report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_1.fq b/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_1.fq
new file mode 100644
index 00000000..4356e47f
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_1.fq
@@ -0,0 +1,144 @@
+@noncoding1:1:77:136/1
+CATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCATTGCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:2:48:107/1
+CTGAGTGAAGCGACGTACGCGGGTGGTGACATGTACTCCACTCCCATACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:3:98:159/1
+TCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:4:126:185/1
+CGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:5:26:85/1
+CGTAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGTGGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:6:85:145/1
+CCACTCCCATACATCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:7:53:112/1
+TGAAGCGACGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:8:110:170/1
+TCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:9:73:132/1
+GTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTAAGGCATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:10:51:110/1
+AGTGAAGCGACGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:11:123:183/1
+GCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:12:63:122/1
+TACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:13:91:150/1
+CCATACATCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:14:7:68/1
+GACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTGAGTGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:15:104:163/1
+AGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCGGTGAGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:16:1:60/1
+CGTATCGACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:17:64:123/1
+ACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:18:128:185/1
+CCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCATCCGAGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:19:28:88/1
+TAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGTGGTGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:20:97:157/1
+ATCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:21:22:81/1
+CGTACGTAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:22:95:155/1
+ACATCACTAAGTTTGTCCCTAAGGCATTGCCCGCCGCCCACGAACGAACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:23:119:176/1
+CATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:24:110:169/1
+TCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:25:110:170.dup.2/1
+TCCCTAAGGCATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:26:57:117/1
+GCGACGTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:27:41:100/1
+CTACTGACTGAGTGAAGCGACGTACGCGGGTGGTGACATGTACTCCACTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:28:18:78/1
+CGTACGTACGTAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:29:6:65/1
+CGACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTGAGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:30:3:63/1
+TATCGACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:31:66:124/1
+GCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTCCCTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:32:62:122/1
+GTACGCGGGTGGTGACATGTACTCCACTCCCATACATCACTAAGTTTGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:33:32:91/1
+GTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGTGGTGACATGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:34:28:86/1
+TAGCGTACTGAGTCTACTGACTGAGTGAAGCGACGTACGCGGGTGGTGAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:35:3:64/1
+TATCGACTTGACGATCGTACGTACGTAGCGTACTGAGTCTACTGACTGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:36:120:181/1
+ATTGCCCGCCGCCCACGAACGAACTGCGGTGAGATGCTTAGGGAACGCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_2.fq b/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_2.fq
new file mode 100644
index 00000000..b3ba7388
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding/reads_2.fq
@@ -0,0 +1,144 @@
+@noncoding1:1:77:136/2
+TGAATGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:2:48:107/2
+GCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCCTTAGGGACAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:3:98:159/2
+CGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:4:126:185/2
+AGATCCGCGCGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:5:26:85/2
+TGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:6:85:145/2
+GCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:7:53:112/2
+CCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCCTTAGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:8:110:170/2
+TATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:9:73:132/2
+TGTGATCTCGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:10:51:110/2
+TAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCCTTAGGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:11:123:183/2
+ATCCGCGCGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:12:63:122/2
+GATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:13:91:150/2
+TAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCTAAGCATCTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:14:7:68/2
+CTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:15:104:163/2
+CGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:16:1:60/2
+CAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCGTACGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:17:64:123/2
+GGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:18:128:185/2
+AGATCCGCGCGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:19:28:88/2
+TCGTGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:20:97:157/2
+TCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:21:22:81/2
+CGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:22:95:155/2
+GCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGATGGCGTTCCCTAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:23:119:176/2
+CGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:24:110:169/2
+ATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:25:110:170.dup.2/2
+TATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAATGTGATCTCGGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:26:57:117/2
+CGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:27:41:100/2
+ACCGCAGTTCGTTCGTGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:28:18:78/2
+CGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAGTGGAGTACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:29:6:65/2
+AGGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:30:3:63/2
+GGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:31:66:124/2
+CGGATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:32:62:122/2
+GATGGCGTTCCCTAAGCATCTCACCGCAGTTCGTTCGTGGGCGGCGGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:33:32:91/2
+CGTTCGTGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:34:28:86/2
+GTGGGCGGCGGGCAATGCCTTAGGGACAAACTTAGTGATGTATGGGAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:35:3:64/2
+GGGACAAACTTAGTGATGTATGGGAGTGGAGTACATGTCACCACCCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@noncoding1:36:120:181/2
+CCGCGCGAGAGTATATATCGCTCGTCGCTGATAGCTGCTCGCTCGTGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_non_coding/references.fa b/ariba/tests/data/cluster_test_full_run_ok_non_coding/references.fa
new file mode 100644
index 00000000..2a9253db
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_non_coding/references.fa
@@ -0,0 +1,6 @@
+>noncoding1
+CGTACGCGGGTGGAGACATGTACTCCACTCCCATACATCCCTAAGTTTGTCCCTAAGGCA
+GTGCCCGCCGCCCACGAACGAATGCGGTGAGATGCTTAGGGAACGCCTATCCGAGATCAC
+>noncoding2
+TCTTTAACTGTTCACGACTGTATCGCGGCTTGCAAATCTTAAGTTCTTCCCAAGCGCGCT
+GCGATACAAATCCCAAGTTTAGCGGACAGTTCACGCCGGGTTCTAAGAATGTATGCGTCC
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence.fa b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.fa
new file mode 100644
index 00000000..c7a01d84
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.fa
@@ -0,0 +1,5 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv
new file mode 100644
index 00000000..fca05715
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence.metadata.tsv
@@ -0,0 +1,4 @@
+presence_absence1	.	.	N	Generic description of presence_absence1
+presence_absence1	p	R3S	N	Ref and assembly have wild type, so do not report
+presence_absence1	p	A10V	N	Ref has wild, reads have variant so report
+presence_absence1	p	I5A	N	Ref and reads have variant so report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_1.fq b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_1.fq
new file mode 100644
index 00000000..3d9db949
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_1.fq
@@ -0,0 +1,128 @@
+@presence_absence1:1:42:100/1
+CAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAGTGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:2:95:155/1
+AACGCGCGAGCACCAACATCAGCCATATTAACGGCATTAGCGCGTGGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:3:33:94/1
+TTCGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:4:18:78/1
+ACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTGATATGGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:5:86:147/1
+AAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCATTAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:6:53:112/1
+GTAGTGATATGGATCGCGAAGCGATGACCCATGAAGTGACCGAACGCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:7:98:156/1
+GCGCGAGCACCAACATCAGCCATATTAACGGCATTAGCGCGTGGGAAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:8:29:87/1
+TATCTTCGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:9:84:145/1
+TGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCATTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:10:35:94/1
+CGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:11:9:66/1
+CTATCATCGACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:12:9:68/1
+CTATCATCGACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:13:82:142/1
+CATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:14:83:142/1
+ATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:15:54:114/1
+TAGTGATATGGATCGCGAAGCGATGACCCATGAAGTGACCGAACGCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:16:81:140/1
+CCATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:17:18:76/1
+ACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTGATATGGATC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:18:14:73/1
+ATCGACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTAGTGATATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:19:6:65/1
+ACTCTATCATCGACGACGTCGGTTATCTTCGACGCTCAGTCGTAGCCGTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:20:30:89/1
+ATCTTCGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:21:77:136/1
+TGACCCATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATTAAC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:22:47:107/1
+GTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAGTGACCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:23:90:149/1
+GACCGAACGCGCGAGCACCAACATCAGCCATATTAACGGCATTAGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:24:109:169/1
+AACATCAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:25:35:94.dup.2/1
+CGACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:26:74:133/1
+CGATGACCCATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:27:36:97/1
+GACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:28:39:99/1
+GCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:29:74:135/1
+CGATGACCCATGAAGTGACCGAACGCGCGAGCACCAACATCAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:30:40:99/1
+CTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:31:36:96/1
+GACGCTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:32:40:101/1
+CTCAGTCGTAGCCGTAGTGATATGGATCGCGAAGCGATGACCCATGAAGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_2.fq b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_2.fq
new file mode 100644
index 00000000..94dc98bb
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/reads_2.fq
@@ -0,0 +1,128 @@
+@presence_absence1:1:42:100/2
+ATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:2:95:155/2
+CGACTGATACGTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:3:33:94/2
+TCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:4:18:78/2
+CGTTAATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:5:86:147/2
+ACGTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:6:53:112/2
+TAGTGTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:7:98:156/2
+ACGACTGATACGTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:8:29:87/2
+CGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCACT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:9:84:145/2
+GTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:10:35:94/2
+TCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:11:9:66/2
+TGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCATCGCTTCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:12:9:68/2
+GCTGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCATCGCTTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:13:82:142/2
+GACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:14:83:142/2
+GACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:15:54:114/2
+CGTAGTGTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:16:81:140/2
+CTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTCCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:17:18:76/2
+TTAATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:18:14:73/2
+ATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCATCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:19:6:65/2
+GATGTTGGTGCTCGCGCGTTCGGTCACTTCATGGGTCATCGCTTCGCGAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:20:30:89/2
+CGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTCGGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:21:77:136/2
+TGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTCCCACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:22:47:107/2
+TTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:23:90:149/2
+ATACGTCGACTCGTGTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:24:109:169/2
+TTTTGTGTGAGCTACGACTGATACGTCGACTCGTGTCGTTCAGCTACGTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:25:35:94.dup.2/2
+TCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGTTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:26:74:133/2
+CGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTCCCACGCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:27:36:97/2
+CTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:28:39:99/2
+TGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:29:74:135/2
+GTCGTTCAGCTACGTCAGCTACGTAGTGTTATTCCATGCTTTCCCACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:30:40:99/2
+TGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:31:36:96/2
+TTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@presence_absence1:32:40:101/2
+CATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTGATGTTGGTGCTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_presence_absence/references.fa b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/references.fa
new file mode 100644
index 00000000..c7a01d84
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_presence_absence/references.fa
@@ -0,0 +1,5 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.fa b/ariba/tests/data/cluster_test_full_run_ok_variants_only.fa
new file mode 100644
index 00000000..b99d42a6
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.fa
@@ -0,0 +1,5 @@
+>variants_only1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>variants_only2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
new file mode 100644
index 00000000..7cc8e4e5
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.always_report.metadata.tsv
@@ -0,0 +1,2 @@
+variants_only1	.	.	N	Generic description of variants_only1
+variants_only1	p	R3S	Y	Ref and assembly have wild type, but always report anyway
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv
new file mode 100644
index 00000000..3a1a9c26
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.not_present.metadata.tsv
@@ -0,0 +1,2 @@
+variants_only1	.	.	N	Generic description of variants_only1
+variants_only1	p	R3S	N	Ref and assembly have wild type, so do not report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv b/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv
new file mode 100644
index 00000000..22c8edd6
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only.present.metadata.tsv
@@ -0,0 +1,3 @@
+variants_only1	.	.	N	Generic description of variants_only1
+variants_only1	p	R3S	N	Ref and assembly have wild type, so do not report
+variants_only1	p	I5A	N	Ref and reads have variant so report
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_1.fq b/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_1.fq
new file mode 100644
index 00000000..9f18c629
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_1.fq
@@ -0,0 +1,132 @@
+@variants_only1:1:3:63/1
+CTACGAGCATGCGTCGCAGTCGACGACGTAGCAGCAGTCACGTAGCAGCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:2:24:83/1
+GACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCGCGAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:3:92:151/1
+CCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:4:64:122/1
+GATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:5:95:154/1
+AACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:6:87:144/1
+AGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:7:109:168/1
+AACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAAAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:8:105:165/1
+CACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGGAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:9:56:116/1
+ATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:10:76:135/1
+ATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:11:19:79/1
+CAGTCGACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:12:100:161/1
+GCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:13:40:101/1
+TCACGTAGCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:14:51:112/1
+CACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:15:74:135/1
+CGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATATT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:16:22:81/1
+TCGACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:17:102:158/1
+GAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:18:92:151.dup.2/1
+CCGAACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:19:102:162/1
+GAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAAGCATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:20:22:80/1
+TCGACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:21:63:121/1
+GGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:22:96:155/1
+ACGCGCGAGCACCAACATTAGCCATATTAACGGCATTAGCGCGTGGGAAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:23:8:69/1
+AGCATGCGTCGCAGTCGACGACGTAGCAGCAGTCACGTAGCAGCACGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:24:8:67/1
+AGCATGCGTCGCAGTCGACGACGTAGCAGCAGTCACGTAGCAGCACGTAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:25:36:96/1
+GCAGTCACGTAGCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:26:73:133/1
+GCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCATAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:27:41:100/1
+CACGTAGCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:28:50:109/1
+GCACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:29:47:106/1
+GCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:30:69:130/1
+CGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:31:54:113/1
+GTATCAAATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:32:38:97/1
+AGTCACGTAGCAGCACGTATCAAATGGATCGCGAAGCGATGACCCATGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:33:23:83/1
+CGACGACGTAGCAGCAGTCACGTAGCAGCACGTATCAAATGGATCGCGAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_2.fq b/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_2.fq
new file mode 100644
index 00000000..89b713bb
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only/reads_2.fq
@@ -0,0 +1,132 @@
+@variants_only1:1:3:63/2
+TGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGTCATCGCTTCGCGATCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:2:24:83/2
+AATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:3:92:151/2
+GCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:4:64:122/2
+GCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:5:95:154/2
+GTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTTA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:6:87:144/2
+CGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:7:109:168/2
+CTGGTAACGTGCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:8:105:165/2
+GTAACGTGCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:9:56:116/2
+GTCGCTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:10:76:135/2
+TGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:11:19:79/2
+CCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:12:100:161/2
+CGTGCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:13:40:101/2
+CATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:14:51:112/2
+CTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:15:74:135/2
+TGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:16:22:81/2
+TGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:17:102:158/2
+GCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:18:92:151.dup.2/2
+GCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTTATTC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:19:102:162/2
+ACGTGCTCGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:20:22:80/2
+GCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:21:63:121/2
+CTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:22:96:155/2
+CGTCGCTAGTACGTCGTAGATGCTTTGACTGCTGCTGTCGTCGCTGCTTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:23:8:69/2
+GGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGTCATCGCTTCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:24:8:67/2
+CTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCATGGGTCATCGCTTCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:25:36:96/2
+TTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:26:73:133/2
+CTTTGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCGCT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:27:41:100/2
+ATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:28:50:109/2
+CTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:29:47:106/2
+TATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:30:69:130/2
+TGACTGCTGCTGTCGTCGCTGCTTTATTCCATGCTTTCCCACGCGCTAAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:31:54:113/2
+GCTGCTTTATTCCATGCTTTCCCACGCGCTAATGCCGTTAATATGGCTAA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:32:38:97/2
+CTTTCCCACGCGCTAATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@variants_only1:33:23:83/2
+AATGCCGTTAATATGGCTAATGTTGGTGCTCGCGCGTTCGGTCGCTTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/cluster_test_full_run_ok_variants_only/references.fa b/ariba/tests/data/cluster_test_full_run_ok_variants_only/references.fa
new file mode 100644
index 00000000..b99d42a6
--- /dev/null
+++ b/ariba/tests/data/cluster_test_full_run_ok_variants_only/references.fa
@@ -0,0 +1,5 @@
+>variants_only1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>variants_only2
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller.gene.fa b/ariba/tests/data/cluster_test_gapfill_with_gapfiller.gene.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_gapfill_with_gapfiller.gene.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/genes.fa b/ariba/tests/data/cluster_test_gapfill_with_gapfiller/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_gapfill_with_gapfiller/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_generic/genes.fa b/ariba/tests/data/cluster_test_generic/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_generic/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_generic/reads_1.fq b/ariba/tests/data/cluster_test_generic/reads_1.fq
deleted file mode 100644
index 6ff1e12f..00000000
--- a/ariba/tests/data/cluster_test_generic/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_generic/reads_2.fq b/ariba/tests/data/cluster_test_generic/reads_2.fq
deleted file mode 100644
index 2eb387ff..00000000
--- a/ariba/tests/data/cluster_test_generic/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_get_read_counts/genes.fa b/ariba/tests/data/cluster_test_get_read_counts/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_get_read_counts/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq b/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq
deleted file mode 100644
index 6ff1e12f..00000000
--- a/ariba/tests/data/cluster_test_get_read_counts/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq b/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq
deleted file mode 100644
index 2eb387ff..00000000
--- a/ariba/tests/data/cluster_test_get_read_counts/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa b/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_get_read_counts_fail/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq b/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq
deleted file mode 100644
index 46dd9a20..00000000
--- a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_1.fq
+++ /dev/null
@@ -1,8 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
-@read2/1
-TCATCATA
-+
-:D:D:D:D
diff --git a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq b/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq
deleted file mode 100644
index 2eb387ff..00000000
--- a/ariba/tests/data/cluster_test_get_read_counts_fail/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_init_no_gene_fa/reads_1.fq b/ariba/tests/data/cluster_test_init_no_refs_fa/reads_1.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_init_no_gene_fa/reads_1.fq
rename to ariba/tests/data/cluster_test_init_no_refs_fa/reads_1.fq
diff --git a/ariba/tests/data/cluster_test_init_no_gene_fa/reads_2.fq b/ariba/tests/data/cluster_test_init_no_refs_fa/reads_2.fq
similarity index 100%
rename from ariba/tests/data/cluster_test_init_no_gene_fa/reads_2.fq
rename to ariba/tests/data/cluster_test_init_no_refs_fa/reads_2.fq
diff --git a/ariba/tests/data/cluster_test_init_refdata.fa b/ariba/tests/data/cluster_test_init_refdata.fa
new file mode 100644
index 00000000..f98f0aca
--- /dev/null
+++ b/ariba/tests/data/cluster_test_init_refdata.fa
@@ -0,0 +1,2 @@
+>seq1
+ACGTACGT
diff --git a/ariba/tests/data/cluster_test_load_final_contigs.contigs.fa b/ariba/tests/data/cluster_test_load_final_contigs.contigs.fa
deleted file mode 100644
index a32ff99d..00000000
--- a/ariba/tests/data/cluster_test_load_final_contigs.contigs.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->spam
-ACGT
->egg1
-TGCA
->egg2
-AAAA
diff --git a/ariba/tests/data/cluster_test_load_final_contigs/genes.fa b/ariba/tests/data/cluster_test_load_final_contigs/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_load_final_contigs/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_load_final_contigs/reads_1.fq b/ariba/tests/data/cluster_test_load_final_contigs/reads_1.fq
deleted file mode 100644
index 6ff1e12f..00000000
--- a/ariba/tests/data/cluster_test_load_final_contigs/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_load_final_contigs/reads_2.fq b/ariba/tests/data/cluster_test_load_final_contigs/reads_2.fq
deleted file mode 100644
index 2eb387ff..00000000
--- a/ariba/tests/data/cluster_test_load_final_contigs/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz b/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz
deleted file mode 100644
index 87c55ae7..00000000
Binary files a/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz and /dev/null differ
diff --git a/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz.tbi b/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz.tbi
deleted file mode 100644
index 02adc592..00000000
Binary files a/ariba/tests/data/cluster_test_make_report_lines.read_depths.gz.tbi and /dev/null differ
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam/genes.fa b/ariba/tests/data/cluster_test_parse_assembly_bam/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_parse_assembly_bam/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam/reads_1.fq b/ariba/tests/data/cluster_test_parse_assembly_bam/reads_1.fq
deleted file mode 100644
index 6ff1e12f..00000000
--- a/ariba/tests/data/cluster_test_parse_assembly_bam/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_parse_assembly_bam/reads_2.fq b/ariba/tests/data/cluster_test_parse_assembly_bam/reads_2.fq
deleted file mode 100644
index 2eb387ff..00000000
--- a/ariba/tests/data/cluster_test_parse_assembly_bam/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords.coords b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords.coords
deleted file mode 100644
index 57be1095..00000000
--- a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords.coords
+++ /dev/null
@@ -1,7 +0,0 @@
-a.fa b.fa
-NUCMER
-
-[S1]	[E1]	[S2]	[E2]	[LEN 1]	[LEN 2]	[% IDY]	[LEN R]	[LEN Q]	[FRM]	[TAGS]
-1	1000	1	1000	1000	1000	100.00	1000	1000	1	1	gene	contig1	[IDENTITY]
-1	240	1	240	240	240	100.00	1000	580	1	1	gene	contig2	
-661	1000	241	580	340	340	100.00	1000	580	1	1	gene	contig2	
diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/genes.fa b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/genes.fa
deleted file mode 100644
index 8d999fa4..00000000
--- a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_1.fq b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_1.fq
deleted file mode 100644
index 6ff1e12f..00000000
--- a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_2.fq b/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_2.fq
deleted file mode 100644
index 2eb387ff..00000000
--- a/ariba/tests/data/cluster_test_parse_assembly_vs_gene_coords/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds.out.fa b/ariba/tests/data/cluster_test_rename_scaffolds.out.fa
deleted file mode 100644
index d60043f8..00000000
--- a/ariba/tests/data/cluster_test_rename_scaffolds.out.fa
+++ /dev/null
@@ -1,6 +0,0 @@
->name_of_gene.scaffold.1
-TACG
->name_of_gene.scaffold.2
-ACGT
->name_of_gene.scaffold.3
-CGTA
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds/genes.fa b/ariba/tests/data/cluster_test_rename_scaffolds/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_rename_scaffolds/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds/reads_1.fq b/ariba/tests/data/cluster_test_rename_scaffolds/reads_1.fq
deleted file mode 100644
index 6ff1e12f..00000000
--- a/ariba/tests/data/cluster_test_rename_scaffolds/reads_1.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/1
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_rename_scaffolds/reads_2.fq b/ariba/tests/data/cluster_test_rename_scaffolds/reads_2.fq
deleted file mode 100644
index 2eb387ff..00000000
--- a/ariba/tests/data/cluster_test_rename_scaffolds/reads_2.fq
+++ /dev/null
@@ -1,4 +0,0 @@
-@read1/2
-ACGTACGT
-+
-IIIIIIII
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa b/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_scaffold_with_sspace.gene.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa b/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_scaffold_with_sspace/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa b/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa
deleted file mode 100644
index 042775d6..00000000
--- a/ariba/tests/data/cluster_test_set_assembly_kmer/genes.fa
+++ /dev/null
@@ -1,2 +0,0 @@
->name_of_gene
-AAACCCGGGTTT
diff --git a/ariba/tests/data/cluster_test_set_assembly_kmer/reads_1.fq b/ariba/tests/data/cluster_test_set_assembly_kmer/reads_1.fq
deleted file mode 100644
index 192e5e7c..00000000
--- a/ariba/tests/data/cluster_test_set_assembly_kmer/reads_1.fq
+++ /dev/null
@@ -1,12 +0,0 @@
-@1/1
-AGTGACGTA
-+
-III:DIIII
-@2/1
-ACGTGACGTA
-+
-II:-()IIII
-@3/1
-AACGTGACGTA
-+
-II;)IIIIIII
diff --git a/ariba/tests/data/cluster_test_set_assembly_kmer/reads_2.fq b/ariba/tests/data/cluster_test_set_assembly_kmer/reads_2.fq
deleted file mode 100644
index fc17f72b..00000000
--- a/ariba/tests/data/cluster_test_set_assembly_kmer/reads_2.fq
+++ /dev/null
@@ -1,12 +0,0 @@
-@1/2
-ACGTG
-+
-IIIII
-@2/2
-ACAGTG
-+
-IIIIIII
-@3/2
-ACGTAGA
-+
-IIIIIII
diff --git a/ariba/tests/data/clusters_test_dummy_db.fa b/ariba/tests/data/clusters_test_dummy_db.fa
index e69de29b..6c2305af 100644
--- a/ariba/tests/data/clusters_test_dummy_db.fa
+++ b/ariba/tests/data/clusters_test_dummy_db.fa
@@ -0,0 +1,2 @@
+>x
+CTACTGCATCGTAATCATCGTATACCATTGACTGCATCAATCTGCATCTATGCATCAAA
diff --git a/ariba/tests/data/clusters_test_write_report.tsv b/ariba/tests/data/clusters_test_write_report.tsv
index 382892c0..d9e3ab28 100644
--- a/ariba/tests/data/clusters_test_write_report.tsv
+++ b/ariba/tests/data/clusters_test_write_report.tsv
@@ -1,3 +1,3 @@
-#gene	flag	reads	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt	read_depth	alt_bases	ref_alt_depth
-gene1 line1
-gene2 line2
+#ref_name	ref_type	flag	reads	cluster_rep	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+gene1	line1
+gene2	line2
diff --git a/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads.bam b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads.bam
new file mode 100644
index 00000000..916eaff4
Binary files /dev/null and b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads.bam differ
diff --git a/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_1.fq b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_1.fq
new file mode 100644
index 00000000..5d5b849c
--- /dev/null
+++ b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_1.fq
@@ -0,0 +1,16 @@
+@read1/1
+TCCACAGGATGGTGGTATACCTGAGGCCAAAGGATACAGATCTTGTGGGAAAGGTCCGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read2/1
+CATCTGACTGAACTCATCTGACTGACTACTATACTCAGTGCATGCATCGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read3/1
+CATCTGACTGAACTCATCTGACTGACTACTATACTCAGTGCATGCATCGTCA
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read4/1
+TCCACAGGATGGTGGTATACCTGAGGCCAAAGGATACAGATCTTGTGGGAAAGGTCCGCC
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_2.fq b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_2.fq
new file mode 100644
index 00000000..b0580a4f
--- /dev/null
+++ b/ariba/tests/data/mapping_test_bowtie2_remove_both_unmapped_reads_2.fq
@@ -0,0 +1,16 @@
+@read1/2
+CTACTGACTGACTGATCGATCGATCATCTACTATCCTACTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read2/2
+ACCATAATGTTCTTAGGGCTTACCATAGAGGTACACTAAAAAGTGTTTCATCCACCTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read3/2
+CTACTGACTGACTGATCGATCGATCATCTACTATCCTACTCAT
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
+@read4/2
+ACCATAATGTTCTTAGGGCTTACCATAGAGGTACACTAAAAAGTGTTTCATCCACCTTAG
++
+IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/mapping_test_smalt_reads_1.fq b/ariba/tests/data/mapping_test_smalt_reads_1.fq
deleted file mode 100644
index a786c430..00000000
--- a/ariba/tests/data/mapping_test_smalt_reads_1.fq
+++ /dev/null
@@ -1,28 +0,0 @@
-@1/1
-AGCCCTCCACAGGATGGTGGTATAC
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@2/1
-TAATGTTCTTAGGGCTTACCATAGA
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@3/1
-TCGGGTCTGTACAAGGACGGATGGT
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@4/1
-CCGCCGGGAAGTCCTTCTGTCGTGC
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@5/1
-CCTCCACAGGATGGTGGTATACCTG
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@6/1
-CAGTTGCATGACGTCATGCAGTCAT
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@7/1
-ACGCCGGGAAGTCCTTCTGTCGTGT
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/mapping_test_smalt_reads_2.fq b/ariba/tests/data/mapping_test_smalt_reads_2.fq
deleted file mode 100644
index 33e89bd4..00000000
--- a/ariba/tests/data/mapping_test_smalt_reads_2.fq
+++ /dev/null
@@ -1,28 +0,0 @@
-@1/2
-ACCTTTCCCACAAGATCTGTATCCT
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@2/2
-CGAGTCTGCGCTTAGCTAAGGTGGA
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@3/2
-CGTACTGACTGACTGACGTACTGCA
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@4/2
-TTTTAGTGTACCTCTATGGTAAGCC
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@5/2
-TCTGCGCTTAGCTAAGGTGGATGAA
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@6/2
-AATGAGTATGATGAGTAATGGTATG
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
-@7/2
-ATTTAGTGTACCTCTATGGTAAGCC
-+
-IIIIIIIIIIIIIIIIIIIIIIIII
diff --git a/ariba/tests/data/mapping_test_smalt_ref.fa b/ariba/tests/data/mapping_test_smalt_ref.fa
deleted file mode 100644
index 18764e8e..00000000
--- a/ariba/tests/data/mapping_test_smalt_ref.fa
+++ /dev/null
@@ -1,5 +0,0 @@
->ref
-TCCACAGGATGGTGGTATACCTGAGGCCAAAGGATACAGATCTTGTGGGAAAGGTCCGCC
-GGGAAGTCCTTCTGTCGTGCTTTTTATCGGGTCTGTACAAGGACGGATGGTTTCCGGCAT
-ACCATAATGTTCTTAGGGCTTACCATAGAGGTACACTAAAAAGTGTTTCATCCACCTTAG
-CTAAGCGCAG
diff --git a/ariba/tests/data/mapping_test_smalt_ref.fa.fai b/ariba/tests/data/mapping_test_smalt_ref.fa.fai
deleted file mode 100644
index 045a7897..00000000
--- a/ariba/tests/data/mapping_test_smalt_ref.fa.fai
+++ /dev/null
@@ -1 +0,0 @@
-ref	190	5	60	61
diff --git a/ariba/tests/data/mapping_test_smalt_sorted.bam b/ariba/tests/data/mapping_test_smalt_sorted.bam
deleted file mode 100644
index e50c639a..00000000
Binary files a/ariba/tests/data/mapping_test_smalt_sorted.bam and /dev/null differ
diff --git a/ariba/tests/data/mapping_test_smalt_unsorted.bam b/ariba/tests/data/mapping_test_smalt_unsorted.bam
deleted file mode 100644
index 40e3d8e5..00000000
Binary files a/ariba/tests/data/mapping_test_smalt_unsorted.bam and /dev/null differ
diff --git a/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv b/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv
new file mode 100644
index 00000000..4976b746
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_metadata.expected.tsv
@@ -0,0 +1,8 @@
+non_coding_1	.	.	N	should be in output because this field is here
+non_coding_1	n	C5A	N	dna variant ok
+presence_absence_1	.	.	N	should be in output because this field is here
+presence_absence_2	n	T4G	N	dna variant ok
+presence_absence_3	p	R3S	N	amino acid variant ok
+variants_only_1	.	.	N	should be kept as a generic description of variants_only_1
+variants_only_1	p	S2T	N	amino acid variant ok
+variants_only_1	n	T4A	N	dna variant ok
diff --git a/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv b/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv
new file mode 100644
index 00000000..5fe13cf0
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_metadata.in.tsv
@@ -0,0 +1,20 @@
+non_coding_1	.	.
+non_coding_1	.	.	N	should be in output because this field is here
+non_coding_1	p	L2K	N	should be removed because this is non-coding, but variant is protein
+non_coding_1	n	C5A	N	dna variant ok
+non_coding_not_in_fasta	.	. N	should be removed from tsv because not in fasta
+presence_absence_1	.	.	N
+presence_absence_1	.	.	N	should be in output because this field is here
+presence_absence_2	n	T4G	N	dna variant ok
+presence_absence_2	n	A4G	N	dna variant not ok
+presence_absence_3	p	R3S	N	amino acid variant ok
+presence_absence_3	p	I3S	N	amino acid variant not ok
+presence_absence_not_in_fasta	.	. N	should be removed from tsv because not in fasta
+variants_only_1	n	T4A	N	dna variant ok
+variants_only_1	n	C4G	N	dna variant not ok
+variants_only_1	p	S2T	N	amino acid variant ok
+variants_only_1	p	I2L	N	amin acid variant not ok
+variants_only_1	.	.	N	should be kept as a generic description of variants_only_1
+variants_only_1	.	.	N
+variants_only_not_in_fasta	.	.	N	should be removed from tsv because not in fasta
+variants_only_no_good_variants	n	A4G	N	dna variant not ok
diff --git a/ariba/tests/data/reference_data_filter_bad_data_non_coding.expected.fa b/ariba/tests/data/reference_data_filter_bad_data_non_coding.expected.fa
new file mode 100644
index 00000000..000b42f3
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_non_coding.expected.fa
@@ -0,0 +1,2 @@
+>non_coding_1
+AAAAAAAGAAGGAAAA
diff --git a/ariba/tests/data/reference_data_filter_bad_data_non_coding.in.fa b/ariba/tests/data/reference_data_filter_bad_data_non_coding.in.fa
new file mode 100644
index 00000000..000b42f3
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_non_coding.in.fa
@@ -0,0 +1,2 @@
+>non_coding_1
+AAAAAAAGAAGGAAAA
diff --git a/ariba/tests/data/reference_data_filter_bad_data_presence_absence.expected.fa b/ariba/tests/data/reference_data_filter_bad_data_presence_absence.expected.fa
new file mode 100644
index 00000000..077de1b0
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_presence_absence.expected.fa
@@ -0,0 +1,6 @@
+>presence_absence_1
+ATGTCTTAA
+>presence_absence_2
+ATGTTTTAA
+>presence_absence_3
+ATGTTTCGTTAA
diff --git a/ariba/tests/data/reference_data_filter_bad_data_presence_absence.in.fa b/ariba/tests/data/reference_data_filter_bad_data_presence_absence.in.fa
new file mode 100644
index 00000000..077de1b0
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_presence_absence.in.fa
@@ -0,0 +1,6 @@
+>presence_absence_1
+ATGTCTTAA
+>presence_absence_2
+ATGTTTTAA
+>presence_absence_3
+ATGTTTCGTTAA
diff --git a/ariba/tests/data/reference_data_filter_bad_data_variants_only.expected.fa b/ariba/tests/data/reference_data_filter_bad_data_variants_only.expected.fa
new file mode 100644
index 00000000..974509c5
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_variants_only.expected.fa
@@ -0,0 +1,2 @@
+>variants_only_1
+ATGTCCTGTTAG
diff --git a/ariba/tests/data/reference_data_filter_bad_data_variants_only.in.fa b/ariba/tests/data/reference_data_filter_bad_data_variants_only.in.fa
new file mode 100644
index 00000000..d68032c6
--- /dev/null
+++ b/ariba/tests/data/reference_data_filter_bad_data_variants_only.in.fa
@@ -0,0 +1,6 @@
+>variants_only_1
+ATGTCCTGTTAG
+>variants_only_should_be_removed
+ATGTCCTGTTAG
+>variants_only_no_good_variants
+ATGTTTTAG
diff --git a/ariba/tests/data/reference_data_get_filename b/ariba/tests/data/reference_data_get_filename
new file mode 100644
index 00000000..ce013625
--- /dev/null
+++ b/ariba/tests/data/reference_data_get_filename
@@ -0,0 +1 @@
+hello
diff --git a/ariba/tests/data/reference_data_init.tsv b/ariba/tests/data/reference_data_init.tsv
new file mode 100644
index 00000000..691625d7
--- /dev/null
+++ b/ariba/tests/data/reference_data_init.tsv
@@ -0,0 +1,4 @@
+gene1	n	A42G	N	free text
+gene1	n	A42T	N	free text2
+gene1	n	G13T	N	confers killer rabbit resistance
+gene2	p	I42L	Y	removes tardigrade's space-living capability
diff --git a/ariba/tests/data/reference_data_init_empty.fa b/ariba/tests/data/reference_data_init_empty.fa
new file mode 100644
index 00000000..e69de29b
diff --git a/ariba/tests/data/reference_data_init_presence_absence.fa b/ariba/tests/data/reference_data_init_presence_absence.fa
new file mode 100644
index 00000000..3f4671cd
--- /dev/null
+++ b/ariba/tests/data/reference_data_init_presence_absence.fa
@@ -0,0 +1,4 @@
+>gene1
+CATTCCTAGCGTCGTCTATCGTCG
+>gene2
+AAAAACCCCGGGGTTTT
diff --git a/ariba/tests/data/reference_data_keep_seqs_from_dict.fa b/ariba/tests/data/reference_data_keep_seqs_from_dict.fa
new file mode 100644
index 00000000..045a3bda
--- /dev/null
+++ b/ariba/tests/data/reference_data_keep_seqs_from_dict.fa
@@ -0,0 +1,2 @@
+>seq1
+acgt
diff --git a/ariba/tests/data/reference_data_keep_seqs_from_dict.log b/ariba/tests/data/reference_data_keep_seqs_from_dict.log
new file mode 100644
index 00000000..067ea0b8
--- /dev/null
+++ b/ariba/tests/data/reference_data_keep_seqs_from_dict.log
@@ -0,0 +1 @@
+seq2 has no variants. Removing.
diff --git a/ariba/tests/data/reference_data_load_fasta_file.fa b/ariba/tests/data/reference_data_load_fasta_file.fa
new file mode 100644
index 00000000..6b27daed
--- /dev/null
+++ b/ariba/tests/data/reference_data_load_fasta_file.fa
@@ -0,0 +1,2 @@
+>seq1
+ACGT
diff --git a/ariba/tests/data/reference_data_load_metadata_tsv.tsv b/ariba/tests/data/reference_data_load_metadata_tsv.tsv
new file mode 100644
index 00000000..2b54928a
--- /dev/null
+++ b/ariba/tests/data/reference_data_load_metadata_tsv.tsv
@@ -0,0 +1,3 @@
+gene1	n	A42G	N	free text
+gene1	n	G13T	N	confers killer rabbit resistance
+gene2	p	I42L	Y	removes tardigrade's space-living capability
diff --git a/ariba/tests/data/reference_data_make_catted_fasta.expected.fa b/ariba/tests/data/reference_data_make_catted_fasta.expected.fa
new file mode 100644
index 00000000..b2dd9a37
--- /dev/null
+++ b/ariba/tests/data/reference_data_make_catted_fasta.expected.fa
@@ -0,0 +1,6 @@
+>pa1
+ATGTTTTAA
+>vonly1
+ATGTTTTAA
+>nc1
+CACCATACTGCATCT
diff --git a/ariba/tests/data/reference_data_make_catted_fasta.noncoding.fa b/ariba/tests/data/reference_data_make_catted_fasta.noncoding.fa
new file mode 100644
index 00000000..2fee0334
--- /dev/null
+++ b/ariba/tests/data/reference_data_make_catted_fasta.noncoding.fa
@@ -0,0 +1,2 @@
+>nc1
+CACCATACTGCATCT
diff --git a/ariba/tests/data/reference_data_make_catted_fasta.presence_absence.fa b/ariba/tests/data/reference_data_make_catted_fasta.presence_absence.fa
new file mode 100644
index 00000000..9caa1f6e
--- /dev/null
+++ b/ariba/tests/data/reference_data_make_catted_fasta.presence_absence.fa
@@ -0,0 +1,2 @@
+>pa1
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_make_catted_fasta.variants_only.fa b/ariba/tests/data/reference_data_make_catted_fasta.variants_only.fa
new file mode 100644
index 00000000..1b0ae3c3
--- /dev/null
+++ b/ariba/tests/data/reference_data_make_catted_fasta.variants_only.fa
@@ -0,0 +1,2 @@
+>vonly1
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_remove_bad_genes.in.fa b/ariba/tests/data/reference_data_remove_bad_genes.in.fa
new file mode 100644
index 00000000..47d3da08
--- /dev/null
+++ b/ariba/tests/data/reference_data_remove_bad_genes.in.fa
@@ -0,0 +1,10 @@
+>g1
+ACGTG
+>g2
+ACGCGTACGTATCGACGTATCTGACGTACGTAGTACCGTACGTACGTAATCACGTAGTACTGACTGAGTCGTCAGTCAGCTGTAGTACGTAGCACATATA
+>g3
+GAGGAGCCG
+>g4
+ATGTAACCT
+>g5
+ATGCCTGAG
diff --git a/ariba/tests/data/reference_data_sequence.presence_absence.fa b/ariba/tests/data/reference_data_sequence.presence_absence.fa
new file mode 100644
index 00000000..3ea7b0a1
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence.presence_absence.fa
@@ -0,0 +1,2 @@
+>pa
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_sequence_length.presence_absence.fa b/ariba/tests/data/reference_data_sequence_length.presence_absence.fa
new file mode 100644
index 00000000..3ea7b0a1
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence_length.presence_absence.fa
@@ -0,0 +1,2 @@
+>pa
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_sequence_type.noncoding.fa b/ariba/tests/data/reference_data_sequence_type.noncoding.fa
new file mode 100644
index 00000000..92270e6a
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence_type.noncoding.fa
@@ -0,0 +1,2 @@
+>noncoding
+AAAAAA
diff --git a/ariba/tests/data/reference_data_sequence_type.presence_absence.fa b/ariba/tests/data/reference_data_sequence_type.presence_absence.fa
new file mode 100644
index 00000000..3ea7b0a1
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence_type.presence_absence.fa
@@ -0,0 +1,2 @@
+>pa
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_sequence_type.variants_only.fa b/ariba/tests/data/reference_data_sequence_type.variants_only.fa
new file mode 100644
index 00000000..acf47308
--- /dev/null
+++ b/ariba/tests/data/reference_data_sequence_type.variants_only.fa
@@ -0,0 +1,2 @@
+>var_only
+ATGTTTTAA
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.noncoding.fa b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.noncoding.fa
new file mode 100644
index 00000000..601a75d3
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.noncoding.fa
@@ -0,0 +1,2 @@
+>non_coding
+AAGTCATGCATCTA
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.pres_abs.fa b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.pres_abs.fa
new file mode 100644
index 00000000..09fa61e8
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.pres_abs.fa
@@ -0,0 +1,2 @@
+>presence_absence_gene
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.var_only.fa b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.var_only.fa
new file mode 100644
index 00000000..65d15d35
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.ref.var_only.fa
@@ -0,0 +1,2 @@
+>var_only_gene
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv
new file mode 100644
index 00000000..292e18ed
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_all_non_wild_type_variants.tsv
@@ -0,0 +1,12 @@
+var_only_gene	n	A8T	N	ref has wild type A should get ignored
+var_only_gene	n	G9C	N	ref has variant C instead of G
+var_only_gene	p	G4I	N	ref has wild type F
+var_only_gene	p	F6I	Y	ref has wild type F
+var_only_gene	p	P3Q	N	ref has wild type P should get ignored
+var_only_gene	p	I5V	N	ref has variant V instead of I
+presence_absence_gene	n	A4G	N	ref has wild type A should get ignored
+presence_absence_gene	n	A6C	N	ref has variant C instead of A
+presence_absence_gene	p	N2I	N	ref has wild tpye N should get ignored
+presence_absence_gene	p	A4G	N	ref has variant G instead of A
+non_coding	n	A2C	N	ref has wild type A should get ignored
+non_coding	n	C4T	N	ref has variant T instead of C
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit.clusters.tsv b/ariba/tests/data/reference_data_test_cluster_with_cdhit.clusters.tsv
new file mode 100644
index 00000000..ba13af65
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit.clusters.tsv
@@ -0,0 +1,3 @@
+presence_absence1	presence_absence2
+presence_absence3	presence_absence4
+noncoding1
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit.expected_representatives.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit.expected_representatives.fa
new file mode 100644
index 00000000..b42cb224
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit.expected_representatives.fa
@@ -0,0 +1,10 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>noncoding1
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit.non_coding.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit.non_coding.fa
new file mode 100644
index 00000000..3008b6e4
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit.non_coding.fa
@@ -0,0 +1,5 @@
+>noncoding1
+GTACATTACTGGCGACCCAAGGAAGGGAAATCTGTTAAACATGATCTCGGTAGTCTATAG
+AACAGATTTAACATTACCTGGTGTTTGCTCCTTGCATCATCCTCTGACTATTCTAGACCA
+GGGAGGACTTTGGTCACGCGCGACCTTGCACTGTGGCGACGCCATAGAACCGTCTACCTG
+ATGCTGGGAAGGGTTTGCTG
diff --git a/ariba/tests/data/reference_data_test_cluster_with_cdhit.presence_absence.fa b/ariba/tests/data/reference_data_test_cluster_with_cdhit.presence_absence.fa
new file mode 100644
index 00000000..4541dff5
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_cluster_with_cdhit.presence_absence.fa
@@ -0,0 +1,10 @@
+>presence_absence1
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAACGCGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence2
+ATGGATCGCGAAGCGATGACCCATGAAGCGACCGAAATGGCGAGCACCAACATTAGCCAT
+ATTAACGGCATTAGCGCGTGGGAAAGCATGGAATAA
+>presence_absence3
+ATGGCGTGCGATGAATTTGGCCATATTAAACTGATGAACCCGCAGCGCAGCACCTAA
+>presence_absence4
+ATGGCGTGCGATGAATTTGGCATGATTAAACTGATGAACCCGCAGCGCAGCACCTAA
diff --git a/ariba/tests/data/reference_data_test_remove_bad_genes.log b/ariba/tests/data/reference_data_test_remove_bad_genes.log
new file mode 100644
index 00000000..5b148109
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_remove_bad_genes.log
@@ -0,0 +1,4 @@
+g1 Remove: too short. Length: 5
+g2 Remove: too long. Length: 100
+g3 Does not look like a gene (does not start with start codon, length (9) is not a multiple of 3 (length/3=3.0), or contains internal stop codons). Translation: EEP
+g4 Does not look like a gene (does not start with start codon, length (9) is not a multiple of 3 (length/3=3.0), or contains internal stop codons). Translation: M*P
diff --git a/ariba/tests/data/reference_data_test_write_cluster_allocation_file.expected b/ariba/tests/data/reference_data_test_write_cluster_allocation_file.expected
new file mode 100644
index 00000000..ae974b0e
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_write_cluster_allocation_file.expected
@@ -0,0 +1,4 @@
+seq1	seq2
+seq3	seq4	seq5
+seq6
+seq10	seq42
diff --git a/ariba/tests/data/reference_data_test_write_seqs_to_fasta.expected.fa b/ariba/tests/data/reference_data_test_write_seqs_to_fasta.expected.fa
new file mode 100644
index 00000000..adcd5e6f
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_write_seqs_to_fasta.expected.fa
@@ -0,0 +1,6 @@
+>seq1
+ATGAACCCCGGGGTTTTTTAA
+>seq4
+ATGAACCCCGGGGTTTTTTAA
+>seq5
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/reference_data_test_write_seqs_to_fasta.in.fa b/ariba/tests/data/reference_data_test_write_seqs_to_fasta.in.fa
new file mode 100644
index 00000000..bf5df893
--- /dev/null
+++ b/ariba/tests/data/reference_data_test_write_seqs_to_fasta.in.fa
@@ -0,0 +1,10 @@
+>seq1
+ATGAACCCCGGGGTTTTTTAA
+>seq2
+ATGAACCCCGGGGTTTTTTAA
+>seq3
+ATGAACCCCGGGGTTTTTTAA
+>seq4
+ATGAACCCCGGGGTTTTTTAA
+>seq5
+ATGAACCCCGGGGTTTTTTAA
diff --git a/ariba/tests/data/cdhit_test_rename_fasta.out.fa b/ariba/tests/data/reference_data_write_dict_of_sequences.fa
similarity index 50%
rename from ariba/tests/data/cdhit_test_rename_fasta.out.fa
rename to ariba/tests/data/reference_data_write_dict_of_sequences.fa
index 7ab37993..3becfa85 100644
--- a/ariba/tests/data/cdhit_test_rename_fasta.out.fa
+++ b/ariba/tests/data/reference_data_write_dict_of_sequences.fa
@@ -1,6 +1,4 @@
 >seq1
-A
+ACGT
 >seq2
-C
->seq3
-G
+GGGG
diff --git a/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv b/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv
new file mode 100644
index 00000000..8319f9ea
--- /dev/null
+++ b/ariba/tests/data/reference_data_write_metadata_tsv.expected.tsv
@@ -0,0 +1,2 @@
+gene1	.	.	N	has anybody got a bottle of orange juice?
+gene2	.	.	N	we didn't burn him
diff --git a/ariba/tests/data/reference_data_write_metadata_tsv.tsv b/ariba/tests/data/reference_data_write_metadata_tsv.tsv
new file mode 100644
index 00000000..79667e31
--- /dev/null
+++ b/ariba/tests/data/reference_data_write_metadata_tsv.tsv
@@ -0,0 +1,2 @@
+gene2	.	.	N	we didn't burn him
+gene1	.	.	N	has anybody got a bottle of orange juice?
diff --git a/ariba/tests/data/reference_data_write_metadata_tsv_presence_absence.fa b/ariba/tests/data/reference_data_write_metadata_tsv_presence_absence.fa
new file mode 100644
index 00000000..1f144424
--- /dev/null
+++ b/ariba/tests/data/reference_data_write_metadata_tsv_presence_absence.fa
@@ -0,0 +1,4 @@
+>gene1
+CCTACTATCGCGTCTGCTG
+>gene2
+CGCAGCAGCCGACAGAGAGA
diff --git a/ariba/tests/data/samtools_variants_test_get_depths_at_position.bam b/ariba/tests/data/samtools_variants_test_get_depths_at_position.bam
new file mode 100644
index 00000000..a2c29b2c
Binary files /dev/null and b/ariba/tests/data/samtools_variants_test_get_depths_at_position.bam differ
diff --git a/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa b/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa
new file mode 100644
index 00000000..dc03a6ea
--- /dev/null
+++ b/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa
@@ -0,0 +1,18 @@
+>ref
+AGTGCCTTTTAGACTAGACGCACTCTTCTTGCTGAGAAACTAGAGCTGTCGCTCCAAGAG
+GAGTTAAAAAGCAGAACCTGGACCACAGTTCCTGAAGAATACCGTGTATGTACTGCAGCC
+GGTGTACCTGTCTGGACCCTATGCTCGCGAAAACGGACTCATTACTGCAGGGTTGCACCG
+CCGTGCTCGGAGGGAGGTCAGTCCCCGGTGAGATCTACACCGGCTGACACCACCCTTTCC
+CACCCAGTGAAGTACCTTCAGCCAAAGGACGAGGCTAGTAACAAAATTCTGCGATGTGCG
+TGGAGCGCTACTAAATGGCCCGTAGTACGGCCCACTACAGCGTACCTTTTGGGCACTACA
+ATTACCTCCGGTATTTGCTTAGATCATTGCAGTAAGGACTCATAAGAAACCTTCCTGTCA
+TAGCACCCCGCAGTGCCACAGAAATGGAGTTTTGTGTGAATTTGATAAGGACGGCACTCC
+GCAGTACCCAACCCATGAGTATCTATGGCCATTGTTGATTGGAGCCCTTATCAGTGTCTT
+AATCCATAATCGGATTATGTCGACCCGTTCTAGTTATATTGCATTCCTAACCCTGGTCCT
+GGGTGCCTTGCATTCCACGAAGAGCTGCAGAAATTTCGCGCACATCAGACACGAACACGC
+CAAACCCGTATCTACCGCACCAACCGGCCTCGCTGACTAGGGCATAATGCGGTGGGATGG
+CAACTGTGTCCTTTTTCGTAGTATCGACTGATATATGGATGCACTCCGCGGTCGTTTGAG
+AGCGGACGGATCACTAGGACATTTGCGGTGGGTTTTAGGCATTGACCGAGCTAGTCCATG
+TTTTTCCATGACGGGTGTGTCGATCAATTACAGCGGTTCCACGATCGAGAAGCACTATCG
+TCTCGGATATTGACCTGTAAGCTGGGAGATCTCCACCAACAGTATTGGGATACGTGGTCC
+CACCGGTAGTAGGATCGCTCCTGCCCGAACGACTAGTTAA
diff --git a/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa.fai b/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa.fai
new file mode 100644
index 00000000..32074835
--- /dev/null
+++ b/ariba/tests/data/samtools_variants_test_get_depths_at_position.ref.fa.fai
@@ -0,0 +1 @@
+ref	1000	5	60	61
diff --git a/ariba/tests/data/cluster_test_get_assembly_read_depths.gz b/ariba/tests/data/samtools_variants_test_get_read_depths.gz
similarity index 100%
rename from ariba/tests/data/cluster_test_get_assembly_read_depths.gz
rename to ariba/tests/data/samtools_variants_test_get_read_depths.gz
diff --git a/ariba/tests/data/cluster_test_get_assembly_read_depths.gz.tbi b/ariba/tests/data/samtools_variants_test_get_read_depths.gz.tbi
similarity index 100%
rename from ariba/tests/data/cluster_test_get_assembly_read_depths.gz.tbi
rename to ariba/tests/data/samtools_variants_test_get_read_depths.gz.tbi
diff --git a/ariba/tests/data/cluster_test_get_samtools_variant_positions.vcf b/ariba/tests/data/samtools_variants_test_get_variant_positions_from_vcf.vcf
similarity index 100%
rename from ariba/tests/data/cluster_test_get_samtools_variant_positions.vcf
rename to ariba/tests/data/samtools_variants_test_get_variant_positions_from_vcf.vcf
diff --git a/ariba/tests/data/cluster_test_get_samtools_variants.read_depths.gz b/ariba/tests/data/samtools_variants_test_get_variants.read_depths.gz
similarity index 100%
rename from ariba/tests/data/cluster_test_get_samtools_variants.read_depths.gz
rename to ariba/tests/data/samtools_variants_test_get_variants.read_depths.gz
diff --git a/ariba/tests/data/cluster_test_get_samtools_variants.read_depths.gz.tbi b/ariba/tests/data/samtools_variants_test_get_variants.read_depths.gz.tbi
similarity index 100%
rename from ariba/tests/data/cluster_test_get_samtools_variants.read_depths.gz.tbi
rename to ariba/tests/data/samtools_variants_test_get_variants.read_depths.gz.tbi
diff --git a/ariba/tests/data/cluster_test_get_samtools_variants.vcf b/ariba/tests/data/samtools_variants_test_get_variants.vcf
similarity index 100%
rename from ariba/tests/data/cluster_test_get_samtools_variants.vcf
rename to ariba/tests/data/samtools_variants_test_get_variants.vcf
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.fa b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.fa
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.fa.fai b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa.fai
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.fa.fai
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa.fai
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.bam b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.bam
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.bam
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.bam
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.read_depths.gz b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.read_depths.gz
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.read_depths.gz.tbi b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz.tbi
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.read_depths.gz.tbi
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz.tbi
diff --git a/ariba/tests/data/cluster_test_make_assembly_vcf.assembly.vcf b/ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf
similarity index 100%
rename from ariba/tests/data/cluster_test_make_assembly_vcf.assembly.vcf
rename to ariba/tests/data/samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf
diff --git a/ariba/tests/data/cluster_test_get_vcf_variant_counts.vcf b/ariba/tests/data/samtools_variants_test_variants_in_coords.vcf
similarity index 100%
rename from ariba/tests/data/cluster_test_get_vcf_variant_counts.vcf
rename to ariba/tests/data/samtools_variants_test_variants_in_coords.vcf
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
index ff5aaf52..24a7af2d 100644
--- a/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
+++ b/ariba/tests/data/summary_test_gather_output_rows.in.1.tsv
@@ -1,4 +1,3 @@
-#gene	flag	reads	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt	read_depth	alt_bases	ref_alt_depth
-gene1	27	42	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.	.	.	.
-gene2	15	44	2	780	780	100.0	.	.	.	.	.	.	gene2.scaffold.2	1124	.	.	.	.	.	.
-gene2	15	46	2	780	770	99.0	.	.	.	.	.	.	gene2.scaffold.3	1097	.	.	.	.	.	.
+#ref_name	ref_type	flag	reads	cluster_rep	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
diff --git a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
index 3d7bfb73..1113e417 100644
--- a/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
+++ b/ariba/tests/data/summary_test_gather_output_rows.in.2.tsv
@@ -1,3 +1,5 @@
-#gene	flag	reads	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt	read_depth	alt_bases	ref_alt_depth
-gene1	27	142	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.	.	.	.
-gene3	27	144	3	750	750	98.93	.	.	.	.	.	.	gene3.scaffold.1	1047	.	.	.	.	.	.
+#ref_name	ref_type	flag	reads	cluster_rep	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	variants_only	64	12	variants_only1	90	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.	.
diff --git a/ariba/tests/data/summary_test_load_file.in.tsv b/ariba/tests/data/summary_test_load_file.in.tsv
index e78caca3..933f3dcf 100644
--- a/ariba/tests/data/summary_test_load_file.in.tsv
+++ b/ariba/tests/data/summary_test_load_file.in.tsv
@@ -1,5 +1,5 @@
-#gene	flag	reads	cluster	gene_len	assembled	pc_ident	var_type	var_effect	new_aa	gene_start	gene_end	gene_nt	scaffold	scaff_len	scaff_start	scaff_end	scaff_nt	read_depth	alt_bases	ref_alt_depth
-gene1	27	42	1	822	822	100.0	.	.	.	.	.	.	gene1.scaffold.1	1490	.	.	.	.	.	.
-gene2	15	44	2	780	780	100.0	.	.	.	.	.	.	gene2.scaffold.2	1124	.	.	.	.	.	.
-gene2	15	46	2	780	770	99.0	.	.	.	.	.	.	gene2.scaffold.3	1097	.	.	.	.	.	.
-gene3	187	48	3	750	750	98.93	SNP	SYN	.	318	318	C	gene3.scaffold.1	1047	319	319	G	.	.	.
+#ref_name	ref_type	flag	reads	cluster_rep	ref_len	ref_base_assembled	pc_ident	ctg	ctg_len	known_var	var_type	var_seq_type	known_var_change	has_known_var	ref_ctg_change	ref_ctg_effect	ref_start	ref_end	ref_nt	ctg_start	ctg_end	ctg_nt	smtls_total_depth	smtls_alt_nt	smtls_alt_depth	var_description	free_text
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	1	SNP	n	A14T	1	A14T	SNP	13	13	A	84	84	T	17	.	17	noncoding1_n_A14T_N_ref has wild type, reads have variant so should report	generic description of noncoding1
+noncoding1	non_coding	19	78	noncoding1	120	120	98.33	noncoding1.scaffold.1	279	1	SNP	n	A6G	1	.	.	6	6	G	77	77	G	18	.	18	noncoding1_n_A6G_N_variant in ref and reads so should report	generic description of noncoding1
+presence_absence1	presence_absence	27	88	presence_absence1	96	96	98.96	presence_absence1.scaffold.1	267	1	SNP	p	A10V	1	A10V	NONSYN	28	28	C	113	113	T	29	.	29	presence_absence1_p_A10V_N_Ref has wild, reads have variant so report	Generic description of presence_absence1
+variants_only1	variants_only	27	64	variants_only1	90	90	100.0	variants_only1.scaffold.1	260	1	SNP	p	S5T	1	.	.	13	15	A;C;C	96	98	A;C;C	12;13;13	.;.;.	12;13;13	variants_only1_p_S5T_N_Ref and reads have variant so report	Generic description of variants_only1
diff --git a/ariba/tests/data/summary_test_write_js_candy_csv.csv b/ariba/tests/data/summary_test_write_js_candy_csv.csv
deleted file mode 100644
index d50d116f..00000000
--- a/ariba/tests/data/summary_test_write_js_candy_csv.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-name,gene1,gene3
-file1,1,3
-file2,2,4
diff --git a/ariba/tests/data/summary_test_write_js_candy_files.csv b/ariba/tests/data/summary_test_write_js_candy_files.csv
deleted file mode 100644
index 5bbee36c..00000000
--- a/ariba/tests/data/summary_test_write_js_candy_files.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-name,gene1,gene2,gene3
-file1,0,1,0
-file2,1,0,3
-file3,0,0,4
diff --git a/ariba/tests/data/summary_test_write_js_candy_files.tre b/ariba/tests/data/summary_test_write_js_candy_files.tre
deleted file mode 100644
index a3f5ac3a..00000000
--- a/ariba/tests/data/summary_test_write_js_candy_files.tre
+++ /dev/null
@@ -1 +0,0 @@
-(file1:1.58113883,(file2:0.7071067812,file3:0.7071067812):0.8740320489);
diff --git a/ariba/tests/data/summary_test_write_phandango_csv.csv b/ariba/tests/data/summary_test_write_phandango_csv.csv
new file mode 100644
index 00000000..0d5a3d80
--- /dev/null
+++ b/ariba/tests/data/summary_test_write_phandango_csv.csv
@@ -0,0 +1,5 @@
+name,seq1:z1,seq1;var.p.I14L:z2,seq1;var.p.P42Q:z2,seq2:z1,seq2;var.n.A14T:z2
+file1,3,0,1,3,1
+file2,3,1,0,3,0
+file3,1,0,0,3,0
+file4,2,1,0,0,0
diff --git a/ariba/tests/data/summary_test_write_phandango_files.csv b/ariba/tests/data/summary_test_write_phandango_files.csv
new file mode 100644
index 00000000..0d5a3d80
--- /dev/null
+++ b/ariba/tests/data/summary_test_write_phandango_files.csv
@@ -0,0 +1,5 @@
+name,seq1:z1,seq1;var.p.I14L:z2,seq1;var.p.P42Q:z2,seq2:z1,seq2;var.n.A14T:z2
+file1,3,0,1,3,1
+file2,3,1,0,3,0
+file3,1,0,0,3,0
+file4,2,1,0,0,0
diff --git a/ariba/tests/data/summary_test_write_phandango_files.tre b/ariba/tests/data/summary_test_write_phandango_files.tre
new file mode 100644
index 00000000..1266cb32
--- /dev/null
+++ b/ariba/tests/data/summary_test_write_phandango_files.tre
@@ -0,0 +1 @@
+(file1:2.236067977,(file4:1.414213562,(file2:0.8660254038,file3:0.8660254038):0.5481881586):0.8218544151);
diff --git a/ariba/tests/external_progs_test.py b/ariba/tests/external_progs_test.py
new file mode 100644
index 00000000..590b086a
--- /dev/null
+++ b/ariba/tests/external_progs_test.py
@@ -0,0 +1,9 @@
+import unittest
+import os
+from ariba import external_progs
+
+class TestExternalProgs(unittest.TestCase):
+    def test_external_progs_ok(self):
+        '''Test that external programs are found'''
+        progs = external_progs.ExternalProgs(verbose=True)
+
diff --git a/ariba/tests/faidx_test.py b/ariba/tests/faidx_test.py
index ce83af69..fa883872 100644
--- a/ariba/tests/faidx_test.py
+++ b/ariba/tests/faidx_test.py
@@ -1,10 +1,11 @@
 import unittest
 import filecmp
 import os
-from ariba import faidx
+from ariba import faidx, external_progs
 
 modules_dir = os.path.dirname(os.path.abspath(faidx.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
 
 
 class TestFaidx(unittest.TestCase):
@@ -13,6 +14,6 @@ def test_write_fa_subset(self):
         infile = os.path.join(data_dir, 'faidx_test_write_fa_subset.in.fa')
         expected = os.path.join(data_dir, 'faidx_test_write_fa_subset.out.fa')
         tmpfile = 'tmp.test_write_fa_subset.out.fa'
-        faidx.write_fa_subset(['seq1', 'seq3', 'seq4'], infile, tmpfile)
+        faidx.write_fa_subset(['seq1', 'seq3', 'seq4'], infile, tmpfile, samtools_exe=extern_progs.exe('samtools'))
         self.assertTrue(filecmp.cmp(expected, tmpfile, shallow=False))
         os.unlink(tmpfile)
diff --git a/ariba/tests/flag_test.py b/ariba/tests/flag_test.py
index 3053c538..f0960729 100644
--- a/ariba/tests/flag_test.py
+++ b/ariba/tests/flag_test.py
@@ -24,7 +24,7 @@ def test_set_flag(self):
     def test_add(self):
         '''Test add'''
         f = flag.Flag()
-        expected = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023]
+        expected = [1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047]
         for i in range(len(flag.flags_in_order)):
             f.add(flag.flags_in_order[i])
             self.assertEqual(f.to_number(), expected[i])
@@ -41,9 +41,9 @@ def test_to_long_str(self):
         '''Test to_long_str'''
         f = flag.Flag(13)
         expected = '\n'.join([
-            '[X] gene_assembled',
-            '[ ] gene_assembled_into_one_contig',
-            '[X] gene_region_assembled_twice',
+            '[X] assembled',
+            '[ ] assembled_into_one_contig',
+            '[X] region_assembled_twice',
             '[X] complete_orf',
             '[ ] unique_contig',
             '[ ] scaffold_graph_bad',
@@ -51,6 +51,7 @@ def test_to_long_str(self):
             '[ ] variants_suggest_collapsed_repeat',
             '[ ] hit_both_strands',
             '[ ] has_nonsynonymous_variants',
+            '[ ] ref_seq_choose_fail',
         ])
 
         self.assertEqual(expected, f.to_long_string())
diff --git a/ariba/tests/mapping_test.py b/ariba/tests/mapping_test.py
index fee1021a..72b44b7e 100644
--- a/ariba/tests/mapping_test.py
+++ b/ariba/tests/mapping_test.py
@@ -2,10 +2,11 @@
 import os
 import shutil
 import pysam
-from ariba import mapping
+from ariba import mapping, external_progs
 
 modules_dir = os.path.dirname(os.path.abspath(mapping.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
 
 
 # different smalt version output slightly different BAMs. Some columns
@@ -23,16 +24,6 @@ def get_sam_columns(bamfile):
 
 
 class TestMapping(unittest.TestCase):
-    def test_bowtie2_in_path(self):
-        '''Test that bowtie2 is in the user's path'''
-        assert(shutil.which('bowtie2') is not None)
-
-
-    def test_samtools_in_path(self):
-        '''Test that samtools is in the user's path'''
-        assert(shutil.which('samtools') is not None)
-
-
     def test_run_bowtie2(self):
         '''Test run_bowtie2 unsorted'''
         self.maxDiff = None
@@ -40,54 +31,62 @@ def test_run_bowtie2(self):
         reads1 = os.path.join(data_dir, 'mapping_test_bowtie2_reads_1.fq')
         reads2 = os.path.join(data_dir, 'mapping_test_bowtie2_reads_2.fq')
         out_prefix = 'tmp.out.bowtie2'
-        mapping.run_bowtie2(reads1, reads2, ref, out_prefix)
+        mapping.run_bowtie2(
+            reads1,
+            reads2,
+            ref,
+            out_prefix,
+            samtools=extern_progs.exe('samtools'),
+            bowtie2=extern_progs.exe('bowtie2'),
+        )
         expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_bowtie2_unsorted.bam'))
         got = get_sam_columns(out_prefix + '.bam')
         self.assertListEqual(expected, got)
         os.unlink(out_prefix + '.bam')
 
 
+    def test_run_bowtie2_remove_both_unmapped(self):
+        '''Test run_bowtie2 (unsorted output) with remove_both_unmapped=True'''
+        self.maxDiff = None  # do not truncate the list diff if the assertion fails
+        ref = os.path.join(data_dir, 'mapping_test_bowtie2_ref.fa')
+        reads1 = os.path.join(data_dir, 'mapping_test_bowtie2_remove_both_unmapped_reads_1.fq')
+        reads2 = os.path.join(data_dir, 'mapping_test_bowtie2_remove_both_unmapped_reads_2.fq')
+        out_prefix = 'tmp.out.bowtie2_remove_both_unmapped'
+        mapping.run_bowtie2(
+            reads1,
+            reads2,
+            ref,
+            out_prefix,
+            samtools=extern_progs.exe('samtools'),
+            bowtie2=extern_progs.exe('bowtie2'),
+            remove_both_unmapped=True,
+        )
+        expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_bowtie2_remove_both_unmapped_reads.bam'))
+        got = get_sam_columns(out_prefix + '.bam')
+        self.assertListEqual(expected, got)
+        os.unlink(out_prefix + '.bam')  # clean up the temporary BAM
+
+
     def test_run_bowtie2_and_sort(self):
         '''Test run_bowtie2 sorted'''
         ref = os.path.join(data_dir, 'mapping_test_bowtie2_ref.fa')
         reads1 = os.path.join(data_dir, 'mapping_test_bowtie2_reads_1.fq')
         reads2 = os.path.join(data_dir, 'mapping_test_bowtie2_reads_2.fq')
         out_prefix = 'tmp.out.bowtie2'
-        mapping.run_bowtie2(reads1, reads2, ref, out_prefix, sort=True)
+        mapping.run_bowtie2(
+            reads1,
+            reads2,
+            ref,
+            out_prefix,
+            sort=True,
+            samtools=extern_progs.exe('samtools'),
+            bowtie2=extern_progs.exe('bowtie2'),
+        )
         expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_bowtie2_sorted.bam'))
         got = get_sam_columns(out_prefix + '.bam')
         self.assertListEqual(expected, got)
         os.unlink(out_prefix + '.bam')
         os.unlink(out_prefix + '.bam.bai')
-        os.unlink(out_prefix + '.unsorted.bam')
-
-
-    #def test_run_smalt(self):
-    #    '''Test run_smalt unsorted'''
-    #    ref = os.path.join(data_dir, 'mapping_test_smalt_ref.fa')
-    #    reads1 = os.path.join(data_dir, 'mapping_test_smalt_reads_1.fq')
-    #    reads2 = os.path.join(data_dir, 'mapping_test_smalt_reads_2.fq')
-    #    out_prefix = 'tmp.out.smalt'
-    #    mapping.run_smalt(reads1, reads2, ref, out_prefix)
-    #    expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_smalt_unsorted.bam'))
-    #    got = get_sam_columns(out_prefix + '.bam')
-    #    self.assertListEqual(expected, got)
-    #    os.unlink(out_prefix + '.bam')
-
-
-    #def test_run_smalt_and_sort(self):
-    #    '''Test run_smalt sorted'''
-    #    ref = os.path.join(data_dir, 'mapping_test_smalt_ref.fa')
-    #    reads1 = os.path.join(data_dir, 'mapping_test_smalt_reads_1.fq')
-    #    reads2 = os.path.join(data_dir, 'mapping_test_smalt_reads_2.fq')
-    #    out_prefix = 'tmp.out.smalt'
-    #    mapping.run_smalt(reads1, reads2, ref, out_prefix, sort=True)
-    #    expected = get_sam_columns(os.path.join(data_dir, 'mapping_test_smalt_sorted.bam'))
-    #    got = get_sam_columns(out_prefix + '.bam')
-    #    self.assertListEqual(expected, got)
-    #    os.unlink(out_prefix + '.bam')
-    #    os.unlink(out_prefix + '.bam.bai')
-    #    os.unlink(out_prefix + '.unsorted.bam')
 
 
     def test_get_total_alignment_score(self):
diff --git a/ariba/tests/refcheck_test.py b/ariba/tests/refcheck_test.py
deleted file mode 100644
index bc13e015..00000000
--- a/ariba/tests/refcheck_test.py
+++ /dev/null
@@ -1,69 +0,0 @@
-import unittest
-import os
-import filecmp
-import pyfastaq
-from ariba import refcheck
-
-modules_dir = os.path.dirname(os.path.abspath(refcheck.__file__))
-data_dir = os.path.join(modules_dir, 'tests', 'data')
-
-
-class TestRefcheck(unittest.TestCase):
-    def test_check_pass(self):
-        '''test check file OK'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_ok.fa')
-        c = refcheck.Checker(infile)
-        self.assertEqual(c.run(), (True, None, None))
-
-
-    def test_check_file_fail_not_gene(self):
-        '''test check file fail not a gene'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_not_gene.fa')
-        c = refcheck.Checker(infile)
-        seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGATGA')
-        self.assertEqual(c.run(), (False, 'Not a gene', seq))
-
-
-    def test_check_file_fail_too_short(self):
-        '''test check file fail short gene'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_too_short.fa')
-        c = refcheck.Checker(infile, min_length=10)
-        seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGGTGA')
-        self.assertEqual(c.run(), (False, 'Too short', seq))
-
-
-    def test_check_file_fail_too_long(self):
-        '''test check file fail long gene'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_too_long.fa')
-        c = refcheck.Checker(infile, max_length=6)
-        seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGGTGA')
-        self.assertEqual(c.run(), (False, 'Too long', seq))
-
-
-    def test_check_file_fail_spades_in_name(self):
-        '''test check file with sequence that has spaces in its name'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_spaces_in_name.fa')
-        c = refcheck.Checker(infile, min_length=3)
-        seq = pyfastaq.sequences.Fasta('gene foo', 'TTGTGGTGA')
-        self.assertEqual(c.run(), (False, 'Name has spaces', seq))
-
-
-    def test_check_file_fail_duplicate_name(self):
-        '''test check file with sequence that has two genes with the same name'''
-        infile = os.path.join(data_dir, 'refcheck_test_check_duplicate_name.fa')
-        c = refcheck.Checker(infile, min_length=3)
-        seq = pyfastaq.sequences.Fasta('gene1', 'TTGTGGTGA')
-        self.assertEqual(c.run(), (False, 'Duplicate name', seq))
-
-
-    def test_check_run_with_outfiles(self):
-        '''test run when making output files'''
-        infile = os.path.join(data_dir, 'refcheck_test_fix_in.fa')
-        tmp_prefix = 'tmp.refcheck_test_fix.out'
-        c = refcheck.Checker(infile, min_length=10, max_length=25, outprefix=tmp_prefix)
-        c.run()
-        for x in ['fa', 'log', 'rename', 'removed.fa']:
-            expected = os.path.join(data_dir, 'refcheck_test_fix_out.' + x)
-            got = tmp_prefix + '.' + x
-            self.assertTrue(filecmp.cmp(expected, got, shallow=False))
-            os.unlink(got)
diff --git a/ariba/tests/reference_data_test.py b/ariba/tests/reference_data_test.py
new file mode 100644
index 00000000..a1a6c4d0
--- /dev/null
+++ b/ariba/tests/reference_data_test.py
@@ -0,0 +1,368 @@
+import unittest
+import filecmp
+import os
+import pyfastaq
+from ariba import reference_data, sequence_metadata
+
+modules_dir = os.path.dirname(os.path.abspath(reference_data.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestReferenceData(unittest.TestCase):
+    def test_init_fails(self):
+        '''Test __init__ raises reference_data.Error when given no, bad or empty input'''
+
+        with self.assertRaises(reference_data.Error):
+            reference_data.ReferenceData()  # no input files at all
+
+        presence_absence_bad  = os.path.join(data_dir, 'reference_data_init_presence_absence_bad.fa')
+
+        with self.assertRaises(reference_data.Error):
+            reference_data.ReferenceData(presence_absence_fa=presence_absence_bad)
+
+        empty_fasta = os.path.join(data_dir, 'reference_data_init_empty.fa')
+
+        with self.assertRaises(reference_data.Error):
+            reference_data.ReferenceData(presence_absence_fa=empty_fasta)
+
+
+    def test_init_ok(self):
+        '''Test __init__ with good input: checks the loaded metadata and seq_dicts'''
+        tsv_file = os.path.join(data_dir, 'reference_data_init.tsv')
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_init_presence_absence.fa')
+        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tN\tfree text')
+        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tA42T\tN\tfree text2')
+        meta3 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tN\tconfers killer rabbit resistance')
+        meta4 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tY\tremoves tardigrade's space-living capability")
+
+        expected_metadata = {  # sequence name -> variant type ('n', 'p' or '.') -> entries
+            'gene1': {
+                'n': {12: {meta3}, 41: {meta1, meta2}},  # keyed by position minus one (G13T -> 12)
+                'p': {},
+                '.': set(),
+            },
+            'gene2': {
+                'n': {},
+                'p': {41: {meta4}},
+                '.': set(),
+            }
+        }
+        ref_data = reference_data.ReferenceData(presence_absence_fa=presence_absence_fa, metadata_tsv=tsv_file)
+        self.assertEqual(expected_metadata, ref_data.metadata)
+
+        expected_seqs_dict = {  # only presence_absence_fa was given, so the other two stay empty
+            'presence_absence': {
+                'gene1': pyfastaq.sequences.Fasta('gene1', 'CATTCCTAGCGTCGTCTATCGTCG'),
+                'gene2': pyfastaq.sequences.Fasta('gene2', 'AAAAACCCCGGGGTTTT')
+            },
+            'variants_only': {},
+            'non_coding': {},
+        }
+
+        self.assertEqual(expected_seqs_dict, ref_data.seq_dicts)
+
+
+    def test_dict_keys_intersection(self):
+        '''Test _dict_keys_intersection'''
+        d1 = {'a': 1, 'b':2, 'c': 42}
+        d2 = {'a': 42}
+        d3 = {'a': 11, 'b': 'xyz'}
+        self.assertEqual({'a'}, reference_data.ReferenceData._dict_keys_intersection([d1, d2, d3]))  # 'a' is the only key in all three
+
+
+    def test_get_filename(self):
+        '''Test _get_filename: relative path -> absolute path, None -> None, missing file -> Error'''
+        file_that_exists_abs = os.path.join(data_dir, 'reference_data_get_filename')
+        file_that_exists_rel = os.path.relpath(file_that_exists_abs)  # relative to the current working directory
+        self.assertEqual(file_that_exists_abs, reference_data.ReferenceData._get_filename(file_that_exists_rel))
+        self.assertIsNone(reference_data.ReferenceData._get_filename(None))
+
+        with self.assertRaises(reference_data.Error):
+            reference_data.ReferenceData._get_filename('thisisnotafilesoshouldthrowerror,unlessyoujustmadeitwhichseemslikeanoddthingtodoandyoudeservethefailingtest')
+
+
+    def test_load_metadata_tsv(self):
+        '''Test _load_metadata_tsv returns the expected nested dict of metadata'''
+        meta1 = sequence_metadata.SequenceMetadata('gene1\tn\tA42G\tN\tfree text')
+        meta2 = sequence_metadata.SequenceMetadata('gene1\tn\tG13T\tN\tconfers killer rabbit resistance')
+        meta3 = sequence_metadata.SequenceMetadata("gene2\tp\tI42L\tY\tremoves tardigrade's space-living capability")
+        expected = {  # sequence name -> variant type ('n', 'p' or '.') -> entries
+            'gene1': {
+                'n': {12: {meta2}, 41: {meta1}},  # keyed by position minus one (G13T -> 12, A42G -> 41)
+                'p': {},
+                '.': set(),
+            },
+            'gene2': {
+                'n': {},
+                'p': {41: {meta3}},
+                '.': set(),
+            }
+        }
+
+        tsv_file = os.path.join(data_dir, 'reference_data_load_metadata_tsv.tsv')
+        self.assertEqual(expected, reference_data.ReferenceData._load_metadata_tsv(tsv_file))
+
+
+    def test_load_fasta_file(self):
+        '''Test _load_fasta_file returns a dict of sequence name -> Fasta object'''
+        expected = {'seq1': pyfastaq.sequences.Fasta('seq1', 'ACGT')}
+        filename = os.path.join(data_dir, 'reference_data_load_fasta_file.fa')
+        got = reference_data.ReferenceData._load_fasta_file(filename)
+        self.assertEqual(expected, got)
+
+
+    def test_find_gene_in_seqs(self):
+        '''Test _find_gene_in_seqs returns the key of the dict holding the name, or None'''
+        seqs_dict = {
+            'dict1': {'name1': 'seq1', 'name2': 'seq2'},
+            'dict2': {'name3': 'seq3'}
+        }
+        self.assertEqual(None, reference_data.ReferenceData._find_gene_in_seqs('name42', seqs_dict))  # name in neither dict
+        self.assertEqual('dict1', reference_data.ReferenceData._find_gene_in_seqs('name1', seqs_dict))
+        self.assertEqual('dict1', reference_data.ReferenceData._find_gene_in_seqs('name2', seqs_dict))
+        self.assertEqual('dict2', reference_data.ReferenceData._find_gene_in_seqs('name3', seqs_dict))
+
+
+    def test_write_metadata_tsv(self):
+        '''Test _write_metadata_tsv: load metadata, write it back out, compare with expected file'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_write_metadata_tsv_presence_absence.fa')
+        metadata_tsv_in = os.path.join(data_dir, 'reference_data_write_metadata_tsv.tsv')
+        metadata_tsv_expected = os.path.join(data_dir, 'reference_data_write_metadata_tsv.expected.tsv')
+        tmp_tsv = 'tmp.test_write_metadata_tsv.out.tsv'
+        ref_data = reference_data.ReferenceData(presence_absence_fa=presence_absence_fa, metadata_tsv=metadata_tsv_in)
+        ref_data._write_metadata_tsv(ref_data.metadata, tmp_tsv)
+        self.assertTrue(filecmp.cmp(metadata_tsv_expected, tmp_tsv, shallow=False))  # byte-by-byte comparison
+        os.unlink(tmp_tsv)
+
+
+    def test_write_dict_of_sequences(self):
+        '''Test _write_dict_of_sequences writes the given sequences to a fasta file'''
+        d = {
+            'seq1': pyfastaq.sequences.Fasta('seq1', 'ACGT'),
+            'seq2': pyfastaq.sequences.Fasta('seq2', 'GGGG'),
+        }
+        tmp_file = 'tmp.test_write_dict_of_sequences.fa'
+        reference_data.ReferenceData._write_dict_of_sequences(d, tmp_file)
+        expected = os.path.join(data_dir, 'reference_data_write_dict_of_sequences.fa')
+        self.assertTrue(filecmp.cmp(expected, tmp_file, shallow=False))  # byte-by-byte comparison
+        os.unlink(tmp_file)
+
+
+    def test_filter_bad_variant_data(self):
+        '''Test _filter_bad_variant_data writes the expected tsv and fasta files'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_presence_absence.in.fa')
+        expected_presence_absence_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_presence_absence.expected.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_variants_only.in.fa')
+        expected_variants_only_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_variants_only.expected.fa')
+        non_coding_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_non_coding.in.fa')
+        expected_non_coding_fa = os.path.join(data_dir, 'reference_data_filter_bad_data_non_coding.expected.fa')
+        metadata_tsv = os.path.join(data_dir, 'reference_data_filter_bad_data_metadata.in.tsv')
+        expected_tsv = os.path.join(data_dir, 'reference_data_filter_bad_data_metadata.expected.tsv')
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=non_coding_fa,
+            metadata_tsv=metadata_tsv
+        )
+
+        outprefix = 'tmp.test_filter_bad_variant_data'
+        refdata._filter_bad_variant_data(outprefix, set(), set())
+
+        self.assertTrue(filecmp.cmp(expected_tsv, outprefix + '.tsv', shallow=False))  # shallow=False: compare contents, not just stat info
+        self.assertTrue(filecmp.cmp(expected_variants_only_fa, outprefix + '.variants_only.fa', shallow=False))
+        self.assertTrue(filecmp.cmp(expected_presence_absence_fa, outprefix + '.presence_absence.fa', shallow=False))
+        self.assertTrue(filecmp.cmp(expected_non_coding_fa, outprefix + '.non_coding.fa', shallow=False))
+        os.unlink(outprefix + '.tsv')
+        os.unlink(outprefix + '.variants_only.fa')
+        os.unlink(outprefix + '.presence_absence.fa')
+        os.unlink(outprefix + '.non_coding.fa')
+        os.unlink(outprefix + '.log')  # the log is removed but its contents are not checked here
+
+
+    def test_gene_seq_is_ok(self):
+        '''Test _gene_seq_is_ok returns (ok, message) for short, long, non-gene and good sequences'''
+        tests = [  # (sequence, expected ok flag, expected message)
+            (pyfastaq.sequences.Fasta('x', 'ACGTG'), False, 'Remove: too short. Length: 5'),
+            (pyfastaq.sequences.Fasta('x', 'A' * 100), False, 'Remove: too long. Length: 100'),
+            (pyfastaq.sequences.Fasta('x', 'GAGGAGCCG'), False, 'Does not look like a gene (does not start with start codon, length (9) is not a multiple of 3 (length/3=3.0), or contains internal stop codons). Translation: EEP'),
+            (pyfastaq.sequences.Fasta('x', 'ATGTAACCT'), False, 'Does not look like a gene (does not start with start codon, length (9) is not a multiple of 3 (length/3=3.0), or contains internal stop codons). Translation: M*P'),
+            (pyfastaq.sequences.Fasta('x', 'ATGCCTGAG'), True, None)
+        ]
+
+        for seq, ok, message in tests:
+            self.assertEqual((ok, message), reference_data.ReferenceData._gene_seq_is_ok(seq, 6, 99))  # 6, 99: presumably min and max length
+
+
+    def test_remove_bad_genes(self):
+        '''Test _remove_bad_genes: checks returned names, the dict left behind and the log file'''
+        presence_absence_fasta = os.path.join(data_dir, 'reference_data_remove_bad_genes.in.fa')
+        refdata = reference_data.ReferenceData(presence_absence_fa=presence_absence_fasta, max_gene_length=99)
+        tmp_log = 'tmp.test_remove_bad_genes.log'
+
+        expected_removed = {'g1', 'g2', 'g3', 'g4'}
+        got_removed = refdata._remove_bad_genes(refdata.seq_dicts['presence_absence'], tmp_log)
+        self.assertEqual(expected_removed, got_removed)
+
+        expected_dict = {  # only g5 is expected to remain in the dict that was passed in
+            'g5': pyfastaq.sequences.Fasta('g5', 'ATGCCTGAG')
+        }
+        self.assertEqual(expected_dict, refdata.seq_dicts['presence_absence'])
+        expected_log = os.path.join(data_dir, 'reference_data_test_remove_bad_genes.log')
+        self.assertTrue(filecmp.cmp(expected_log, tmp_log, shallow=False))
+        os.unlink(tmp_log)
+
+
+    def test_make_catted_fasta(self):
+        '''Test make_catted_fasta output, built from all three input fasta files, matches expected file'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.presence_absence.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.variants_only.fa')
+        noncoding_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.noncoding.fa')
+        expected_fa = os.path.join(data_dir, 'reference_data_make_catted_fasta.expected.fa')
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=noncoding_fa
+        )
+        tmp_out = 'tmp.test.make_catted_fasta.out.fa'
+        refdata.make_catted_fasta(tmp_out)
+        self.assertTrue(filecmp.cmp(expected_fa, tmp_out, shallow=False))  # byte-by-byte comparison
+        os.unlink(tmp_out)
+
+
+    def test_sequence_type(self):
+        '''Test sequence_type returns the type of each known sequence, None for an unknown name'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_sequence_type.presence_absence.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_sequence_type.variants_only.fa')
+        noncoding_fa = os.path.join(data_dir, 'reference_data_sequence_type.noncoding.fa')
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=noncoding_fa
+        )
+
+        tests = [  # (sequence name, expected return value)
+            ('pa', 'presence_absence'),
+            ('var_only', 'variants_only'),
+            ('noncoding', 'non_coding'),
+            ('not_there', None)
+        ]
+
+        for name, expected in tests:
+            self.assertEqual(expected, refdata.sequence_type(name))
+
+
+    def test_sequence(self):
+        '''Test sequence returns the Fasta object for the given name'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_sequence.presence_absence.fa')
+        expected = pyfastaq.sequences.Fasta('pa', 'ATGTTTTAA')
+        refdata = reference_data.ReferenceData(presence_absence_fa=presence_absence_fa)
+        self.assertEqual(expected, refdata.sequence('pa'))
+
+
+    def test_sequence_length(self):
+        '''Test sequence_length returns the length of the named sequence'''
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_sequence_length.presence_absence.fa')
+        refdata = reference_data.ReferenceData(presence_absence_fa=presence_absence_fa)
+        self.assertEqual(9, refdata.sequence_length('pa'))
+
+
+    def test_all_non_wild_type_variants(self):
+        '''Test all_non_wild_type_variants for each sequence type and for an unknown name'''
+        tsv_file = os.path.join(data_dir, 'reference_data_test_all_non_wild_type_variants.tsv')
+        presence_absence_fa = os.path.join(data_dir, 'reference_data_test_all_non_wild_type_variants.ref.pres_abs.fa')
+        variants_only_fa = os.path.join(data_dir, 'reference_data_test_all_non_wild_type_variants.ref.var_only.fa')
+        noncoding_fa = os.path.join(data_dir, 'reference_data_test_all_non_wild_type_variants.ref.noncoding.fa')
+
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            variants_only_fa=variants_only_fa,
+            non_coding_fa=noncoding_fa,
+            metadata_tsv=tsv_file
+        )
+
+        m1 = sequence_metadata.SequenceMetadata('var_only_gene\tn\tG9C\tN\tref has variant C instead of G')
+        m2 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tI5V\tN\tref has variant V instead of I')
+        m3 = sequence_metadata.SequenceMetadata('var_only_gene\tp\tF6I\tY\tref has wild type F')
+        m4 = sequence_metadata.SequenceMetadata('presence_absence_gene\tn\tA6C\tN\tref has variant C instead of A')
+        m5 = sequence_metadata.SequenceMetadata('presence_absence_gene\tp\tA4G\tN\tref has variant G instead of A')
+        m6 = sequence_metadata.SequenceMetadata('non_coding\tn\tC4T\tN\tref has variant T instead of C')
+
+        self.assertEqual({'n': {8: {m1}}, 'p': {4: {m2}, 5: {m3}}}, refdata.all_non_wild_type_variants('var_only_gene'))  # keys are position minus one (G9C -> 8)
+        self.assertEqual({'n': {5: {m4}}, 'p': {3: {m5}}}, refdata.all_non_wild_type_variants('presence_absence_gene'))
+        self.assertEqual({'n': {3: {m6}}, 'p': {}}, refdata.all_non_wild_type_variants('non_coding'))
+        self.assertEqual({'n': {}, 'p': {}}, refdata.all_non_wild_type_variants('not_a_known_sequence'))  # unknown name: empty dicts expected
+
+
+    def test_write_cluster_allocation_file(self):
+        '''Test write_cluster_allocation_file output matches the expected file'''
+        clusters = {  # sequence type -> {cluster name -> set of member names}, or None
+            'presence_absence': {
+                'seq1': {'seq1', 'seq2'},
+                'seq3': {'seq3', 'seq4', 'seq5'},
+                'seq6': {'seq6'}
+            },
+            'non_coding' : {
+                'seq10': {'seq42'}
+            },
+            'variants_only': None
+        }
+        tmpfile = 'tmp.test_write_cluster_allocation_file.out'
+        reference_data.ReferenceData.write_cluster_allocation_file(clusters, tmpfile)
+        expected_file = os.path.join(data_dir, 'reference_data_test_write_cluster_allocation_file.expected')
+        self.assertTrue(filecmp.cmp(expected_file, tmpfile, shallow=False))
+        os.unlink(tmpfile)
+
+
+    def test_cluster_with_cdhit(self):
+        '''Test cluster_with_cdhit: returned clusters, representatives fasta and clusters tsv'''
+        inprefix = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit')
+        presence_absence_fa = inprefix + '.presence_absence.fa'
+        non_coding_fa = inprefix + '.non_coding.fa'
+
+        refdata = reference_data.ReferenceData(
+            presence_absence_fa=presence_absence_fa,
+            non_coding_fa=non_coding_fa,
+        )
+
+        outprefix = 'tmp.test_cluster_with_cdhit'
+
+        expected = {  # sequence type -> {cluster name -> set of member names}
+            'non_coding': {
+                'noncoding1': {'noncoding1'}
+            },
+            'presence_absence': {
+                'presence_absence1': {'presence_absence1', 'presence_absence2'},
+                'presence_absence3': {'presence_absence4', 'presence_absence3'}
+            },
+            'variants_only': None,
+        }
+
+        got = refdata.cluster_with_cdhit(inprefix, outprefix)
+        self.assertEqual(expected, got)
+        all_seqs = {}
+        pyfastaq.tasks.file_to_dict(presence_absence_fa, all_seqs)
+        pyfastaq.tasks.file_to_dict(non_coding_fa, all_seqs)
+        expected_seqs = {x: all_seqs[x] for x in ['presence_absence1', 'presence_absence3', 'noncoding1']}  # the cluster names in expected
+        got_seqs = {}
+        pyfastaq.tasks.file_to_dict(outprefix + '.cluster_representatives.fa', got_seqs)
+        self.assertEqual(expected_seqs, got_seqs)
+
+        expected_clusters_file = os.path.join(data_dir, 'reference_data_test_cluster_with_cdhit.clusters.tsv')
+        got_clusters_file = outprefix + '.clusters.tsv'
+        self.assertTrue(filecmp.cmp(expected_clusters_file, got_clusters_file, shallow=False))
+
+        os.unlink(got_clusters_file)
+        os.unlink(outprefix + '.cluster_representatives.fa')
+        os.unlink(outprefix + '.non_coding.cdhit')
+        os.unlink(outprefix + '.presence_absence.cdhit')
+
+
+    def test_write_seqs_to_fasta(self):
+        '''Test write_seqs_to_fasta, given a set of sequence names, writes the expected fasta'''
+        refdata = reference_data.ReferenceData(presence_absence_fa=os.path.join(data_dir, 'reference_data_test_write_seqs_to_fasta.in.fa'))
+        expected_outfile = os.path.join(data_dir, 'reference_data_test_write_seqs_to_fasta.expected.fa')
+        tmpfile = 'tmp.test.reference_data.write_seqs_to_fasta.out.fa'
+        refdata.write_seqs_to_fasta(tmpfile, {'seq1', 'seq4', 'seq5'})
+        self.assertTrue(filecmp.cmp(expected_outfile, tmpfile, shallow=False))  # byte-by-byte comparison
+        os.unlink(tmpfile)
+
diff --git a/ariba/tests/samtools_variants_test.py b/ariba/tests/samtools_variants_test.py
new file mode 100644
index 00000000..79be04b3
--- /dev/null
+++ b/ariba/tests/samtools_variants_test.py
@@ -0,0 +1,142 @@
+import unittest
+import os
+import pyfastaq
+import pymummer
+from ariba import samtools_variants, external_progs
+
+modules_dir = os.path.dirname(os.path.abspath(samtools_variants.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+extern_progs = external_progs.ExternalProgs()
+
+
+def file2lines(filename):  # helper: return all lines of filename as a list
+    f = pyfastaq.utils.open_file_read(filename)  # used below on a .gz file; presumably handles compression - confirm
+    lines = f.readlines()
+    pyfastaq.utils.close(f)
+    return lines
+
+
+class TestSamtoolsVariants(unittest.TestCase):
+    def test_make_vcf_and_read_depths_files(self):
+        '''test _make_vcf_and_read_depths_files writes the expected VCF calls and read depths'''
+        ref = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.assembly.fa')
+        bam = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.bam')
+        expected_vcf = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.expected.vcf')
+        expected_depths = os.path.join(data_dir, 'samtools_variants_test_make_vcf_and_read_depths_files.expected.read_depths.gz')
+        tmp_prefix = 'tmp.test_make_vcf_and_read_depths_files'
+        sv = samtools_variants.SamtoolsVariants(
+            ref,
+            bam,
+            tmp_prefix,
+            samtools_exe=extern_progs.exe('samtools'),
+            bcftools_exe=extern_progs.exe('bcftools')
+        )
+        sv._make_vcf_and_read_depths_files()
+
+        def get_vcf_call_lines(fname):  # VCF lines excluding '#' header lines
+            with open(fname) as f:
+                lines = [x for x in f.readlines() if not x.startswith('#')]
+            return lines
+
+        expected_lines = get_vcf_call_lines(expected_vcf)
+        got_lines = get_vcf_call_lines(sv.vcf_file)
+        self.assertEqual(expected_lines, got_lines)  # header lines are ignored in this comparison
+        self.assertEqual(file2lines(expected_depths), file2lines(sv.read_depths_file))
+        os.unlink(sv.vcf_file)
+        os.unlink(sv.read_depths_file)
+        os.unlink(sv.read_depths_file + '.tbi')
+
+
+    def test_get_read_depths(self):
+        '''test _get_read_depths returns a 4-tuple for a (name, position), or None'''
+        read_depths_file = os.path.join(data_dir, 'samtools_variants_test_get_read_depths.gz')
+
+        tests = [  # ((sequence name, position), expected return value)
+            ( ('ref1', 42), None ),
+            ( ('ref2', 1), None ),
+            ( ('ref1', 0), ('G', '.', 1, '1') ),
+            ( ('ref1', 2), ('T', 'A', 3, '2,1') ),
+            ( ('ref1', 3), ('C', 'A,G', 42, '21,11,10') ),
+            ( ('ref1', 4), ('C', 'AC', 41, '0,42') )
+        ]
+
+        for (name, position), expected in tests:
+            self.assertEqual(expected, samtools_variants.SamtoolsVariants._get_read_depths(read_depths_file, name, position))
+
+
+    def test_get_variant_positions_from_vcf(self):
+        '''test _get_variant_positions_from_vcf returns a list of (sequence name, position) tuples'''
+        vcf_file = os.path.join(data_dir, 'samtools_variants_test_get_variant_positions_from_vcf.vcf')
+
+        expected = [  # (sequence name, position)
+            ('16__cat_2_M35190.scaffold.1', 92),
+            ('16__cat_2_M35190.scaffold.1', 179),
+            ('16__cat_2_M35190.scaffold.1', 263),
+            ('16__cat_2_M35190.scaffold.6', 93)
+        ]
+        self.assertEqual(expected, samtools_variants.SamtoolsVariants._get_variant_positions_from_vcf(vcf_file))
+
+
+    def test_get_variants(self):
+        '''test _get_variants returns a dict of sequence name -> {position -> depth tuple}'''
+        vcf_file = os.path.join(data_dir, 'samtools_variants_test_get_variants.vcf')
+        read_depths_file = os.path.join(data_dir, 'samtools_variants_test_get_variants.read_depths.gz')
+        positions = [  # (sequence name, position) pairs passed via the positions keyword
+            ('16__cat_2_M35190.scaffold.1', 92),
+            ('16__cat_2_M35190.scaffold.1', 179),
+            ('16__cat_2_M35190.scaffold.1', 263),
+            ('16__cat_2_M35190.scaffold.6', 93)
+        ]
+        expected = {  # one entry per (sequence name, position) in positions
+            '16__cat_2_M35190.scaffold.1': {
+                92: ('T', 'A', 123, '65,58'),
+                179: ('A', 'T', 86, '41,45'),
+                263: ('G', 'C', 97, '53,44'),
+            },
+            '16__cat_2_M35190.scaffold.6': {
+                93: ('T', 'G', 99, '56,43')
+            }
+        }
+
+        got = samtools_variants.SamtoolsVariants._get_variants(vcf_file, read_depths_file, positions=positions)
+        self.assertEqual(expected, got)
+
+
+    def test_variants_in_coords(self):
+        '''test variants_in_coords returns the expected value (1) for one scaffold interval'''
+        vcf_file = os.path.join(data_dir, 'samtools_variants_test_variants_in_coords.vcf')
+
+        # scaffold name -> list of intervals (presumably the regions checked for variants)
+        nucmer_hits = {
+            'scaff1': [pyfastaq.intervals.Interval(0, 41)]
+        }
+
+        got = samtools_variants.SamtoolsVariants.variants_in_coords(nucmer_hits, vcf_file)
+        self.assertEqual(1, got)
+
+
+    def test_get_depths_at_position(self):
+        '''test get_depths_at_position after run(): known position, unknown ref, out-of-range position'''
+        bam = os.path.join(data_dir, 'samtools_variants_test_get_depths_at_position.bam')
+        ref_fa = os.path.join(data_dir, 'samtools_variants_test_get_depths_at_position.ref.fa')
+        tmp_prefix = 'tmp.test_get_depths_at_position'
+        samtools_vars = samtools_variants.SamtoolsVariants(
+            ref_fa,
+            bam,
+            tmp_prefix,
+            samtools_exe=extern_progs.exe('samtools'),
+            bcftools_exe=extern_progs.exe('bcftools')
+        )
+        samtools_vars.run()
+        tests = [  # ((reference name, position), expected return value)
+            (('ref', 425), ('C', 'T', 31, '18,13')),
+            (('not_a_ref', 10), None),
+            (('ref', 1000000000), None)
+        ]
+        for (ref, pos), expected in tests:
+            got = samtools_vars.get_depths_at_position(ref, pos)
+            self.assertEqual(expected, got)
+
+        os.unlink(samtools_vars.vcf_file)
+        os.unlink(samtools_vars.read_depths_file)
+        os.unlink(samtools_vars.read_depths_file + '.tbi')
diff --git a/ariba/tests/sequence_metadata_test.py b/ariba/tests/sequence_metadata_test.py
new file mode 100644
index 00000000..ffbf0295
--- /dev/null
+++ b/ariba/tests/sequence_metadata_test.py
@@ -0,0 +1,67 @@
+import unittest
+import os
+import pyfastaq
+from ariba import sequence_metadata, sequence_variant
+
+modules_dir = os.path.dirname(os.path.abspath(sequence_metadata.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestSequenceMetadata(unittest.TestCase):
+    def test_init_fails_on_bad_lines(self):
+        '''Test init raises Error on lines with too few or too many columns, or a bad column 4'''
+        lines = [
+            'only one column\n',
+            'two\tcolumns is not enough\n',
+            'two\tcolumns\tis not enough\n', # despite its text, this line has three tab-separated columns
+            'six\tcolumns\tis\tone\ttoo\tmany\n',
+            'name\tp\tI42L\tx', # column 4 should be "Y" or "N"
+        ]
+
+        for line in lines:
+            with self.assertRaises(sequence_metadata.Error):
+                sequence_metadata.SequenceMetadata(line)
+
+        with self.assertRaises(sequence_variant.Error): # note: the error type is sequence_variant.Error, not sequence_metadata.Error
+            sequence_metadata.SequenceMetadata('gene\tx\tI42L\tN\n') # presumably rejected because of variant type "x" - TODO confirm
+
+
+    def test_init_on_good_input(self):
+        '''Test init stores each column of a five-column line in the expected attribute'''
+        data = sequence_metadata.SequenceMetadata('gene\tn\tI42L\tN\tspam spam wonderful spam')
+        self.assertEqual(data.name, 'gene')
+        self.assertEqual(data.variant_type, 'n')
+        self.assertEqual(data.variant.wild_value, 'I')
+        self.assertEqual(data.variant.variant_value, 'L')
+        self.assertFalse(data.always_report) # column 4 of the input line is "N"
+        self.assertEqual(data.free_text, 'spam spam wonderful spam')
+
+
+    def test_str(self):
+        '''Test __str__ reproduces the input line, for lines with and without a fifth (free text) column'''
+        lines = [
+            'gene1\tn\tA42G\tY\tspam',
+            'gene2\t.\t.\tN',
+            'gene3\t.\t.\tY\teggs',
+            'gene4\tp\tI42K\tN\tthis mutation kills tardigrades',
+        ]
+
+        for line in lines:
+            self.assertEqual(line, str(sequence_metadata.SequenceMetadata(line)))
+
+
+    def test_has_variant(self):
+        '''Test has_variant for a line with no variant and for "n" and "p" variants at position 2'''
+        tests = [
+            ('gene1\t.\t.\tN', False),
+            ('gene1\tn\tA2T\tN', True), # second nucleotide of seq is T
+            ('gene1\tn\tT2A\tN', False),
+            ('gene1\tp\tI2Y\tN', True), # second amino acid of the translation is Y
+            ('gene1\tp\tY2I\tN', False),
+        ]
+
+        seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA') # translation: MYC*
+
+        for line, expected in tests:
+            metadata = sequence_metadata.SequenceMetadata(line)
+            self.assertEqual(expected, metadata.has_variant(seq))
diff --git a/ariba/tests/sequence_variant_test.py b/ariba/tests/sequence_variant_test.py
new file mode 100644
index 00000000..49a55c53
--- /dev/null
+++ b/ariba/tests/sequence_variant_test.py
@@ -0,0 +1,96 @@
+import unittest
+import os
+import pyfastaq
+from ariba import sequence_variant
+
+modules_dir = os.path.dirname(os.path.abspath(sequence_variant.__file__))
+data_dir = os.path.join(modules_dir, 'tests', 'data')
+
+
+class TestSequenceVariant(unittest.TestCase):
+    def test_init_fails_on_bad_variant_strings(self):
+        '''Test init raises Error on malformed variant strings'''
+        bad_variants = [
+            'x',
+            'x1',
+            '1x',
+            '1x1',
+            'I42K43',
+            'I-1K',
+        ]
+
+        for var in bad_variants:
+            with self.assertRaises(sequence_variant.Error):
+                sequence_variant.Variant('p', var)
+
+
+    def test_init_ok(self):
+        '''Test init accepts upper and lower case letters and stores a zero-based position'''
+        variants = ['I42K', 'i42k', 'I42k', 'i42K']
+
+        for var in variants:
+            aa_var = sequence_variant.Variant('p', var)
+            self.assertEqual(41, aa_var.position)  # "42" in the string -> 41
+            self.assertEqual('I', aa_var.wild_value)
+            self.assertEqual('K', aa_var.variant_value)
+
+
+    def test_init_str(self):
+        '''Test str gives the upper case form of the variant string, whatever the input case'''
+        variants = ['I42K', 'i42k', 'I42k', 'i42K']
+        expected = 'I42K'
+
+        for var in variants:
+            self.assertEqual(expected, str(sequence_variant.Variant('p', var)))
+
+
+    def test_sanity_check_against_seq_no_translate(self):
+        '''Test sanity_check_against_seq without translation, using the sequence exactly as given'''
+        seq = 'BrissSpecialStvff'
+        tests = [
+            ('I3K', True),  # third character of seq is "i", the wild value
+            ('K3I', True),  # third character of seq is "i", the variant value
+            ('A2b', False),
+            ('x1000y', False)  # position is far beyond the end of seq
+        ]
+
+        for var, expected in tests:
+            variant = sequence_variant.Variant('p', var)
+            self.assertEqual(expected, variant.sanity_check_against_seq(seq))
+
+
+    def test_sanity_check_against_seq_translate(self):
+        '''Test sanity_check_against_seq with translate_seq=True, on a nucleotide sequence'''
+        seq = 'AGTACGACGTAC'  # translates to STTY
+        tests = [
+            ('S1X', True),
+            ('x1s', True),
+            ('a1y', False),
+            ('x5y', False)  # the translation only has four amino acids
+        ]
+
+        for var, expected in tests:
+            variant = sequence_variant.Variant('p', var)
+            self.assertEqual(expected, variant.sanity_check_against_seq(seq, translate_seq=True))
+
+
+    def test_has_variant(self):
+        '''Test has_variant for "n" and "p" variants against a short coding sequence'''
+        seq = pyfastaq.sequences.Fasta('name', 'ATGTATTGCTGA') # translation: MYC*
+        tests = [
+            (sequence_variant.Variant('n', 'A2T'), True),
+            (sequence_variant.Variant('n', 'T2A'), False),
+            (sequence_variant.Variant('p', 'I2Y'), True),
+            (sequence_variant.Variant('p', 'Y2I'), False),
+        ]
+
+        for var, expected in tests:
+            self.assertEqual(expected, var.has_variant(seq))
+
+
+    def test_nucleotide_range(self):
+        '''Test nucleotide_range gives a zero-based (start, end) pair: one base for "n", three for "p"'''
+        sv = sequence_variant.Variant('n', 'A2T')
+        self.assertEqual((1, 1), sv.nucleotide_range())
+        sv = sequence_variant.Variant('p', 'I42L')
+        self.assertEqual((123, 125), sv.nucleotide_range())
diff --git a/ariba/tests/summary_test.py b/ariba/tests/summary_test.py
index fc5e4302..096cdd11 100644
--- a/ariba/tests/summary_test.py
+++ b/ariba/tests/summary_test.py
@@ -7,7 +7,7 @@
 modules_dir = os.path.dirname(os.path.abspath(summary.__file__))
 data_dir = os.path.join(modules_dir, 'tests', 'data')
 
-class TestSummry(unittest.TestCase):
+class TestSummary(unittest.TestCase):
     def test_init(self):
         '''Test init'''
         fofn = os.path.join(data_dir, 'summary_test_init.fofn')
@@ -19,73 +19,143 @@ def test_init(self):
         self.assertEqual(s.filenames, ['file42', 'file1', 'file2'])
 
 
-
     def test_line2dict(self):
         '''Test _line2dict'''
-        line = '\t'.join(['gene1', '187', '42', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '66', '66', 'A', 'gene1.scaffold.1', '1047', '67', '67', 'C', '42', 'A', '22,20'])
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
+        line = 'refname\treftype\t19\t78\tcluster\t120\t120\t98.33\tctg_name\t279\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, foo bar\tsome free text'
+
         expected = {
-            'gene': 'gene1',
-            'flag':  flag.Flag(187),
-            'reads': 42,
-            'cluster': '3',
-            'gene_len': 750,
-            'assembled': 750,
-            'pc_ident': 98.93,
+            'ref_name': 'refname',
+            'ref_type': 'reftype',
+            'flag': flag.Flag(19),  # third column, expected wrapped in a flag.Flag
+            'reads': 78,  # expected as int
+            'cluster_rep': 'cluster',
+            'ref_len': 120,
+            'ref_base_assembled': 120,
+            'pc_ident': 98.33,  # expected as float
+            'ctg': 'ctg_name',
+            'ctg_len': 279,
+            'known_var': '1',  # expected to stay a string
             'var_type': 'SNP',
-            'var_effect': 'SYN',
-            'new_aa': '.',
-            'gene_start': 66,
-            'gene_end': 66,
-            'gene_nt': 'A',
-            'scaffold': 'gene1.scaffold.1',
-            'scaff_len': 1047,
-            'scaff_start': 67,
-            'scaff_end': 67,
-            'scaff_nt': 'C',
-            'read_depth': 42,
-            'alt_bases': 'A',
-            'ref_alt_depth': '22,20'
+            'var_seq_type': 'n',
+            'known_var_change': 'A14T',
+            'has_known_var': '1',  # expected to stay a string
+            'ref_ctg_change': 'A14T',
+            'ref_ctg_effect': 'SNP',
+            'ref_start': 13,  # start/end coordinates are expected as int
+            'ref_end': 13,
+            'ref_nt': 'A',
+            'ctg_start': 84,
+            'ctg_end': 84,
+            'ctg_nt': 'T',
+            'smtls_total_depth': '17',  # the three smtls_* values are expected to stay strings
+            'smtls_alt_nt': '.',
+            'smtls_alt_depth': '17',
+            'var_description': 'noncoding1_n_A14T_N_ref has wild type, foo bar',
+            'free_text': 'some free text'
+        }
+
+        self.assertEqual(summary.Summary._line2dict(line), expected)
+
+
+    def test_dict2key(self):
+        '''Test _dict2key gives a (ref_name, var_seq_type, change) tuple, or raises Error for the inconsistent inputs below'''
+        d = {
+            'ref_name': 'ref',
+            'var_type': '.',
+            'known_var_change': '.',
+            'ref_ctg_change': '.',
+            'var_seq_type': '.'
         }
-        self.assertEqual(s._line2dict(line), expected)
+
+        self.assertEqual(('ref', '', ''), summary.Summary._dict2key(d))  # all '.' -> empty strings in the key
+
+        d['var_type'] = 'p'  # var_type set while both change fields are still '.'
+        with self.assertRaises(summary.Error):
+            summary.Summary._dict2key(d)
+
+        d['known_var_change'] = 'I42L'
+        d['var_seq_type'] = 'p'
+        self.assertEqual(('ref', 'p', 'I42L'), summary.Summary._dict2key(d))
+
+        d['ref_ctg_change'] = 'P43Q'  # known_var_change and ref_ctg_change are now both set, and differ
+        with self.assertRaises(summary.Error):
+            summary.Summary._dict2key(d)
+
+        d['known_var_change'] = '.'  # only ref_ctg_change is left, so it is expected in the key
+        self.assertEqual(('ref', 'p', 'P43Q'), summary.Summary._dict2key(d))
 
 
     def test_load_file(self):
         '''Test _load_file'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
-        infile = os.path.join(data_dir, 'summary_test_load_file.in.tsv')
-
         lines = [
-            ['gene1', '27', '42', '1', '822', '822', '100.0', '.', '.', '.', '.', '.', '.', 'gene1.scaffold.1', '1490', '.', '.', '.', '.', '.', '.'],
-            ['gene2', '15', '44', '2', '780', '780', '100.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.2', '1124', '.', '.', '.', '.', '.', '.'],
-            ['gene2', '15', '46', '2', '780', '770', '99.0', '.', '.', '.', '.', '.', '.', 'gene2.scaffold.3', '1097', '.', '.', '.', '.', '.', '.'],
-            ['gene3', '187', '48', '3', '750', '750', '98.93', 'SNP', 'SYN', '.', '318', '318', 'C', 'gene3.scaffold.1', '1047', '319', '319', 'G', '.', '.', '.']
-]
-        dicts = [s._line2dict('\t'.join(x)) for x in lines]
-        expected = {'gene1': [dicts[0]], 'gene2': dicts[1:3], 'gene3': [dicts[3]]}
-        got = s._load_file(infile)
+            'noncoding1\tnon_coding\t19\t78\tnoncoding1\t120\t120\t98.33\tnoncoding1.scaffold.1\t279\t1\tSNP\tn\tA14T\t1\tA14T\tSNP\t13\t13\tA\t84\t84\tT\t17\t.\t17\tnoncoding1_n_A14T_N_ref has wild type, reads have variant so should report\tgeneric description of noncoding1',
+            'noncoding1\tnon_coding\t19\t78\tnoncoding1\t120\t120\t98.33\tnoncoding1.scaffold.1\t279\t1\tSNP\tn\tA6G\t1\t.\t.\t6\t6\tG\t77\t77\tG\t18\t.\t18\tnoncoding1_n_A6G_N_variant in ref and reads so should report\tgeneric description of noncoding1',
+            'presence_absence1\tpresence_absence\t27\t88\tpresence_absence1\t96\t96\t98.96\tpresence_absence1.scaffold.1\t267\t1\tSNP\tp\tA10V\t1\tA10V\tNONSYN\t28\t28\tC\t113\t113\tT\t29\t.\t29\tpresence_absence1_p_A10V_N_Ref has wild, reads have variant so report\tGeneric description of presence_absence1',
+            'variants_only1\tvariants_only\t27\t64\tvariants_only1\t90\t90\t100.0\tvariants_only1.scaffold.1\t260\t1\tSNP\tp\tS5T\t1\t.\t.\t13\t15\tA;C;C\t96\t98\tA;C;C\t12;13;13\t.;.;.\t12;13;13\tvariants_only1_p_S5T_N_Ref and reads have variant so report\tGeneric description of variants_only1',
+        ]
+
+        dicts = [summary.Summary._line2dict(x) for x in lines]  # expected values are built with _line2dict, so this test relies on it
+        expected = {  # ref_name -> {(ref_name, var_seq_type, change): parsed line}
+            'noncoding1': {
+                ('noncoding1', 'n', 'A14T'): dicts[0],
+                ('noncoding1', 'n', 'A6G'): dicts[1],
+            },
+            'presence_absence1': {('presence_absence1', 'p', 'A10V'): dicts[2]},
+            'variants_only1': {('variants_only1', 'p', 'S5T'): dicts[3]}
+        }
+
+        infile = os.path.join(data_dir, 'summary_test_load_file.in.tsv')  # fixture file; its contents are not visible in this test
+        got = summary.Summary._load_file(infile)
         self.assertEqual(expected, got)
 
 
-    def test_to_summary_number(self):
-        '''Test _to_summary_number'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
+    def test_pc_id_of_longest(self):
+        '''Test _pc_id_of_longest gives the pc_ident of key3, the entry with the largest ref_base_assembled'''
+        d = {
+            'seqname': {
+                'key1': {'ref_base_assembled': 10, 'pc_ident': 90.0},
+                'key2': {'ref_base_assembled': 20, 'pc_ident': 89.0},
+                'key3': {'ref_base_assembled': 50, 'pc_ident': 95.1},  # NOTE(review): also the highest pc_ident, so "longest" and "best identity" are not told apart
+                'key4': {'ref_base_assembled': 42, 'pc_ident': 91.0},
+            }
+        }
+
+        self.assertEqual(95.1, summary.Summary._pc_id_of_longest(d, 'seqname'))
+
+
+    def test_to_summary_number_for_seq(self):
+        '''Test _to_summary_number_for_seq gives the expected summary number for a range of flag values'''
         tests = [
             (0, 0),
             (64, 0),
             (7, 1),
             (259, 1),
-            (15, 2),
-            (539, 3),
-            (27, 4),
+            (15, 1),
+            (539, 2),
+            (27, 3),
         ]
 
-        for t in tests:
-            l = [{'flag': flag.Flag(t[0]), 'assembled': 42, 'pc_ident': 99}]
-            self.assertEqual(s._to_summary_number(l), t[1])
+        for test_flag, expected in tests:  # each pair is (flag value, expected summary number)
+            data_dict = {'name': {
+                'key1': {'flag': flag.Flag(test_flag), 'ref_base_assembled': 100, 'pc_ident': 99}
+            }}
 
-        l = [{'flag': flag.Flag(27), 'assembled': 42, 'pc_ident': 89}]
-        self.assertEqual(s._to_summary_number(l), 0)
+            self.assertEqual(expected, summary.Summary._to_summary_number_for_seq(data_dict, 'name', 90))  # third argument 90: presumably a percent identity cutoff (pc_ident is 99 here) - TODO confirm
+
+
+    def test_to_summary_number_for_variant(self):
+        '''Test _to_summary_number_for_variant on every combination of known_var, has_known_var and ref_ctg_change used below'''
+        tests = [  # each entry is (expected number, input dict)
+            (1, {'known_var': '1', 'has_known_var': '1', 'ref_ctg_change': 'I42L'}),
+            (1, {'known_var': '1', 'has_known_var': '1', 'ref_ctg_change': '.'}),
+            (0, {'known_var': '1', 'has_known_var': '0', 'ref_ctg_change': 'I42L'}),  # known variant not present: 0 even though ref_ctg_change is set
+            (0, {'known_var': '1', 'has_known_var': '0', 'ref_ctg_change': '.'}),
+            (1, {'known_var': '0', 'has_known_var': '0', 'ref_ctg_change': 'I42L'}),  # not a known variant: 1 because ref_ctg_change is set
+            (0, {'known_var': '0', 'has_known_var': '0', 'ref_ctg_change': '.'}),
+        ]
+
+        for expected, data_dict in tests:
+            self.assertEqual(expected, summary.Summary._to_summary_number_for_variant(data_dict))
 
 
     def test_gather_output_rows(self):
@@ -94,20 +164,19 @@ def test_gather_output_rows(self):
             os.path.join(data_dir, 'summary_test_gather_output_rows.in.1.tsv'),
             os.path.join(data_dir, 'summary_test_gather_output_rows.in.2.tsv')
         ]
-        s = summary.Summary('out', filenames=infiles)
-        s._gather_output_rows()
+        got = summary.Summary._gather_output_rows(infiles, 90)
         expected = [
-            ['filename', 'gene1', 'gene2', 'gene3'],
-            [infiles[0], 4, 2, 0],
-            [infiles[1], 4, 0, 4],
+            ['filename', 'noncoding1', 'noncoding1;var.n.A14T', 'noncoding1;var.n.A6G', 'presence_absence1', 'presence_absence1;var.p.A10V', 'variants_only1'],
+            [infiles[0], 1, 1, 0, 3, 1, 0],
+            [infiles[1], 1, 1, 1, 3, 1, 0],
         ]
-        self.assertEqual(expected, s.rows_out)
+
+        self.assertEqual(expected, got)
 
 
-    def test_filter_output_rows_filter_true(self):
+    def test_filter_output_rows(self):
         '''Test _filter_output_rows'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
-        s.rows_out = [
+        rows = [
             ['filename', 'gene1', 'gene2', 'gene3'],
             ['file1', 0, 0, 0],
             ['file2', 1, 0, 3],
@@ -120,52 +189,36 @@ def test_filter_output_rows_filter_true(self):
             ['file3', 2, 4],
         ]
 
-        s._filter_output_rows()
-        self.assertEqual(s.rows_out, expected)
-
-
-    def test_filter_output_rows_filter_false(self):
-        '''Test _filter_output_rows'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'], filter_output=False)
-        rows_out = [
-            ['filename', 'gene1', 'gene2', 'gene3'],
-            ['file1', 0, 0, 0],
-            ['file2', 1, 0, 3],
-            ['file3', 2, 0, 4],
-        ]
-
-        s.rows_out = copy.copy(rows_out)
-
-        s._filter_output_rows()
-        self.assertEqual(s.rows_out, rows_out)
+        got = summary.Summary._filter_output_rows(rows)
+        self.assertEqual(expected, got)
 
 
     def test_write_tsv(self):
         '''Test _write_tsv'''
         tmp_out = 'tmp.out.tsv'
-        s = summary.Summary(tmp_out, filenames=['spam', 'eggs'])
-        s.rows_out = [
+        rows = [  # first row holds the column names, then one row per file
             ['filename', 'gene1', 'gene3'],
             ['file2', 1, 3],
             ['file3', 2, 4],
         ]
-        s._write_tsv()
+        summary.Summary._write_tsv(rows, tmp_out)  # called on the class; tmp_out is compared with the expected file below
         expected = os.path.join(data_dir, 'summary_test_write_tsv.out.tsv')
         self.assertTrue(filecmp.cmp(tmp_out, expected, shallow=False))
         os.unlink(tmp_out)
 
 
-    def test_write_js_candy_csv(self):
-        '''Test _write_js_candy_csv'''
-        tmp_out = 'tmp.test_write_js_candy.csv'
-        s = summary.Summary(tmp_out, filenames=['spam', 'eggs'])
-        s.rows_out = [
-            ['filename', 'gene1', 'gene3'],
-            ['file1', 1, 3],
-            ['file2', 2, 4],
+    def test_write_phandango_csv(self):
+        '''Test _write_phandango_csv writes a file identical to the expected CSV file'''
+        tmp_out = 'tmp.test_write_phandango.csv'
+        rows = [
+            ['filename', 'seq1', 'seq1;var.p.I14L', 'seq1;var.p.P42Q', 'seq2', 'seq2;var.n.A14T'],  # variant columns are named <seq>;var.<type>.<change>
+            ['file1', 3, 0, 1, 3, 1],
+            ['file2', 3, 1, 0, 3, 0],
+            ['file3', 1, 0, 0, 3, 0],
+            ['file4', 2, 1, 0, 0, 0],
         ]
-        s._write_js_candy_csv(tmp_out)
-        expected = os.path.join(data_dir, 'summary_test_write_js_candy_csv.csv')
+        summary.Summary._write_phandango_csv(rows, tmp_out)  # called on the class, with the rows and output filename as arguments
+        expected = os.path.join(data_dir, 'summary_test_write_phandango_csv.csv')
         self.assertTrue(filecmp.cmp(expected, tmp_out, shallow=False))
         os.unlink(tmp_out)
 
@@ -204,8 +257,7 @@ def test_distance_score_between_lists(self):
 
     def test_write_distance_matrix(self):
         '''Test _write_distance_matrix'''
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
-        s.rows_out = [
+        rows = [
             ['filename', 'gene1', 'gene2', 'gene3'],
             ['file1', 0, 1, 0],
             ['file2', 1, 0, 3],
@@ -213,7 +265,7 @@ def test_write_distance_matrix(self):
         ]
 
         tmp_distances = 'tmp.test.write_distance_matrix.distances'
-        s._write_distance_matrix(tmp_distances)
+        summary.Summary._write_distance_matrix(rows, tmp_distances)
         expected = os.path.join(data_dir, 'summary_test_write_distance_matrix.distances')
         self.assertTrue(filecmp.cmp(expected, tmp_distances, shallow=False))
         os.unlink(tmp_distances)
@@ -229,19 +281,19 @@ def test_newick_from_dist_matrix(self):
         os.unlink(tmp_tree)
 
 
-    def test_write_js_candy_files(self):
-        '''Test _write_js_candy_files'''
-        tmp_prefix = 'tmp.test.write_js_candy_files'
-        s = summary.Summary('out', filenames=['spam', 'eggs'])
-        s.rows_out = [
-            ['filename', 'gene1', 'gene2', 'gene3'],
-            ['file1', 0, 1, 0],
-            ['file2', 1, 0, 3],
-            ['file3', 0, 0, 4],
+    def test_write_phandango_files(self):
+        '''Test _write_phandango_files'''
+        tmp_prefix = 'tmp.test.write_phandango_files'
+        rows = [
+            ['filename', 'seq1', 'seq1;var.p.I14L', 'seq1;var.p.P42Q', 'seq2', 'seq2;var.n.A14T'],
+            ['file1', 3, 0, 1, 3, 1],
+            ['file2', 3, 1, 0, 3, 0],
+            ['file3', 1, 0, 0, 3, 0],
+            ['file4', 2, 1, 0, 0, 0],
         ]
-        s._write_js_candy_files(tmp_prefix)
-        expected_csv = os.path.join(data_dir, 'summary_test_write_js_candy_files.csv')
-        expected_tre = os.path.join(data_dir, 'summary_test_write_js_candy_files.tre')
+        summary.Summary._write_phandango_files(rows, tmp_prefix)
+        expected_csv = os.path.join(data_dir, 'summary_test_write_phandango_files.csv')
+        expected_tre = os.path.join(data_dir, 'summary_test_write_phandango_files.tre')
         self.assertTrue(filecmp.cmp(expected_csv, tmp_prefix + '.csv', shallow=False))
         self.assertTrue(filecmp.cmp(expected_tre, tmp_prefix + '.tre', shallow=False))
         os.unlink(tmp_prefix + '.csv')
diff --git a/scripts/ariba b/scripts/ariba
index 7a4debea..ec2c297d 100755
--- a/scripts/ariba
+++ b/scripts/ariba
@@ -5,19 +5,23 @@ import sys
 
 
 tasks = {
+    'getref': 'Download reference data',
     'refcheck': 'Check or fix input genes FASTA',
     'run': 'Run the ARIBA local assembly pipeline',
     'summary': 'Summarise multiple reports made by "run"',
     'flag': 'Translate the meaning of a flag output by the pipeline',
+    'test': 'Run on small test dataset',
     'version': 'Print version and exit',
 }
 
 
 ordered_tasks = [
+    'getref',
     'refcheck',
     'run',
     'summary',
     'flag',
+    'test',
     'version',
 ]
 
diff --git a/setup.py b/setup.py
index 39ab459f..19a5b14e 100644
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@
     version='0.6.0',
     description='ARIBA: Antibiotic Resistance Identification By Assembly',
     packages = find_packages(),
+    package_data={'ariba': ['test_run_data/*']},
     author='Martin Hunt',
     author_email='path-help@sanger.ac.uk',
     url='https://github.com/sanger-pathogens/ariba',
@@ -18,9 +19,10 @@
     tests_require=['nose >= 1.3'],
     install_requires=[
         'openpyxl',
-        'pyfastaq >= 3.10.0',
+        'pyfastaq >= 3.11.1',
         'pysam >= 0.8.1',
-        'pymummer>=0.6.1'
+        'pymummer>=0.6.1',
+        'beautifulsoup4'
     ],
     license='GPLv3',
     classifiers=[