-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add phylogenomics entries (#145)
- Loading branch information
Showing
7 changed files
with
284 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,123 @@ | ||
#!/usr/bin/env python | ||
|
||
# Input check for post-assembly "Phylogenomics" workflow | ||
|
||
import os | ||
import sys | ||
from samplesheet_utils import parse_args, make_dir, print_error | ||
|
||
|
||
def check_samplesheet(file_in, file_out):
    """
    Validate a phylogenomics samplesheet and write a cleaned two-column copy.

    The input is expected to be comma-separated with this structure:

    sample,gff_file_path
    SAMPLE_A,/path/to/a.gff
    SAMPLE_B,/path/to/b.gff3

    Columns beyond the first two are ignored. Any failed check prints an
    error message and terminates the process with exit status 1.

    Args:
        file_in: Path to the samplesheet to validate.
        file_out: Path of the validated samplesheet to write. It gets the
            header ``sample,path`` and one row per sample, sorted by sample.
    """
    MIN_COLS = 2
    HEADER = ["sample", "gff_file_path"]
    GFF_EXTENSIONS = (".gff", ".gff3")

    sample_mapping_dict = {}
    with open(file_in, "r") as fin:
        ## Check header
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print(
                "ERROR: Please check samplesheet header -> {} != {}".format(
                    ",".join(header), ",".join(HEADER)
                )
            )
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            lspl = [x.strip().strip('"') for x in line.strip().split(",")]

            # Check valid number of columns per row
            if len(lspl) < len(HEADER):
                print_error(
                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )
            num_cols = len([x for x in lspl if x])
            if num_cols < MIN_COLS:
                print_error(
                    "Invalid number of populated columns (minimum = {})!".format(
                        MIN_COLS
                    ),
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, gff_file_path = lspl[: len(HEADER)]
            if not sample:
                print_error("Sample entry has not been specified!", "Line", line)
            if " " in sample:
                print_error("Sample entry contains spaces!", "Line", line)

            ## Check gff file path and extension
            # A populated extra column can satisfy the populated-column count
            # above while the path itself is empty (e.g. "A,,x"), so the path
            # has to be checked explicitly.
            if not gff_file_path:
                print_error("gff file path has not been specified!", "Line", line)
            if " " in gff_file_path:
                print_error("gff file path contains spaces!", "Line", line)
            if not gff_file_path.endswith(GFF_EXTENSIONS):
                print_error(
                    "Phylo files must be one of .gff or .gff3.",
                    "Line",
                    line,
                )

            ## Create sample mapping dictionary = { sample: path }
            if sample not in sample_mapping_dict:
                sample_mapping_dict[sample] = gff_file_path
            # TODO Come back to this conditional; does it make sense for multiple assemblies to one sample?
            elif sample_mapping_dict[sample] == gff_file_path:
                # Compare with equality: `in` between two strings is a
                # substring test and would misreport "a.gff" vs "dir/a.gff"
                # as a duplicate row.
                print_error("Samplesheet contains duplicate rows!", "Line", line)
            else:
                print_error("Mutliple files associated to one sample!", "Line", line)

    ## Write validated samplesheet with appropriate columns
    if sample_mapping_dict:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:
            fout.write(",".join(["sample", "path"]) + "\n")
            for sample in sorted(sample_mapping_dict):
                fout.write(",".join([sample, sample_mapping_dict[sample]]) + "\n")
    else:
        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
|
||
|
||
def main(args=None):
    """
    Command-line entry point: parse the arguments and validate the samplesheet.

    Args:
        args: Optional argument list forwarded to ``parse_args``.
    """
    parsed = parse_args(args)
    check_samplesheet(parsed.FILE_IN, parsed.FILE_OUT)
|
||
|
||
if __name__ == "__main__":
    # Forward whatever main() returns as the process exit status.
    exit_status = main()
    sys.exit(exit_status)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import os | ||
import sys | ||
import errno | ||
import argparse | ||
|
||
|
||
def parse_args(args=None):
    """
    Parse the two positional command-line arguments of the samplesheet checker.

    Args:
        args: Optional list of argument strings handed to
            ``ArgumentParser.parse_args``.

    Returns:
        The parsed namespace with ``FILE_IN`` and ``FILE_OUT`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Reformat nf-core/arete samplesheet file and check its contents.",
        epilog="Example usage: python check_samplesheet.py <FILE_IN> <FILE_OUT>",
    )
    positionals = (
        ("FILE_IN", "Input samplesheet file."),
        ("FILE_OUT", "Output file."),
    )
    for name, help_text in positionals:
        parser.add_argument(name, help=help_text)
    return parser.parse_args(args)
|
||
|
||
def make_dir(path):
    """
    Create ``path`` (including missing parents) unless it already exists.

    An empty ``path`` is a no-op. An ``OSError`` whose errno is ``EEXIST`` is
    swallowed; any other ``OSError`` propagates to the caller.
    """
    if len(path) == 0:
        return
    try:
        os.makedirs(path)
    except OSError as err:
        if err.errno == errno.EEXIST:
            return
        raise err
|
||
|
||
def print_error(error, context="Line", context_str=""):
    """
    Print a samplesheet error message and exit with status 1.

    Args:
        error: Description of the problem.
        context: Label for the offending item; surrounding whitespace is
            stripped before printing.
        context_str: The offending text; surrounding whitespace is stripped
            before printing. The context line is only printed when both
            ``context`` and ``context_str`` are non-empty.
    """
    message = "ERROR: Please check samplesheet -> {}".format(error)
    has_context = context != "" and context_str != ""
    if has_context:
        message = "{}\n{}: '{}'".format(
            message, context.strip(), context_str.strip()
        )
    print(message)
    sys.exit(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
include { PHYLOSHEET_CHECK } from '../../modules/local/samplesheet_check' | ||
|
||
// Input to the phylo work flow is different | ||
// Instead of reads, pass in GFF files | ||
workflow PHYLO_INPUT_CHECK {
    take:
    samplesheet

    main:
    // Validate the samplesheet, then turn each CSV row into [ meta, gff ]
    PHYLOSHEET_CHECK ( samplesheet )
        .splitCsv ( header:true, sep:',' )
        .map { row -> get_sample_info_phylo(row) }
        .set { genomes }

    // Abort when a sample ID contains a dot "."
    genomes
        .map { meta, gff -> meta.id }
        .subscribe { if ( "$it".contains(".") ) exit 1, "Please review data input, sampleIDs may not contain dots, but \"$it\" does." }

    emit:
    genomes // channel: [ val(meta), gff ]
}
// Build [ meta, gff ] for one samplesheet row; aborts the run when the
// file referenced by row.path does not exist.
def get_sample_info_phylo(LinkedHashMap row) {
    def meta = [ id: row.sample, single_end: true ]
    def gff  = file(row.path)

    if (!gff.exists()) {
        // Dump the offending row before exiting
        print("***")
        print(row)
        print("***")
        exit 1, "ERROR: Please check input samplesheet -> Sequence file does not exist!\n${row.path}"
    }

    return [ meta, gff ]
}
Oops, something went wrong.