diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 44c4976..d3d2ca3 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -124,5 +124,32 @@ curate: genotype_field: "virus_name" nextclade: dataset_name: "nextstrain/measles/N450/WHO-2012" - field_map: "defaults/nextclade_field_map.tsv" + field_map: + # The first column should be the original column name of the Nextclade TSV + # The second column should be the new column name to use in the final metadata TSV + # Nextclade can have pathogen specific output columns so make sure to check which + # columns would be useful for your downstream phylogenetic analysis. + seqName: seqName + clade: clade + coverage: coverage + totalMissing: missing_data + totalSubstitutions: divergence + totalNonACGTNs: nonACGTN + qc.overallStatus: QC_overall + qc.missingData.status: QC_missing_data + qc.mixedSites.status: QC_mixed_sites + qc.privateMutations.status: QC_rare_mutations + qc.snpClusters.status: QC_snp_clusters + qc.frameShifts.status: QC_frame_shifts + qc.stopCodons.status: QC_stop_codons + frameShifts: frame_shifts + privateNucMutations.reversionSubstitutions: private_reversion_substitutions + privateNucMutations.labeledSubstitutions: private_labeled_substitutions + privateNucMutations.unlabeledSubstitutions: private_unlabeled_substitutions + privateNucMutations.totalReversionSubstitutions: private_total_reversion_substitutions + privateNucMutations.totalLabeledSubstitutions: private_total_labeled_substitutions + privateNucMutations.totalUnlabeledSubstitutions: private_total_unlabeled_substitutions + privateNucMutations.totalPrivateSubstitutions: private_total_private_substitutions + qc.snpClusters.clusteredSNPs: private_snp_clusters + qc.snpClusters.totalSNPs: private_total_snp_clusters id_field: "seqName" diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv deleted file mode 100644 index e23c864..0000000 --- a/ingest/defaults/nextclade_field_map.tsv +++ /dev/null @@ -1,28 +0,0 @@ -# TSV file that is a mapping of column names for Nextclade output TSV -# The first column should be the original column name of the Nextclade TSV -# The second column should be the new column name to use in the final metadata TSV -# Nextclade can have pathogen specific output columns so make sure to check which -# columns would be useful for your downstream phylogenetic analysis. -seqName seqName -clade clade -coverage coverage -totalMissing missing_data -totalSubstitutions divergence -totalNonACGTNs nonACGTN -qc.overallStatus QC_overall -qc.missingData.status QC_missing_data -qc.mixedSites.status QC_mixed_sites -qc.privateMutations.status QC_rare_mutations -qc.snpClusters.status QC_snp_clusters -qc.frameShifts.status QC_frame_shifts -qc.stopCodons.status QC_stop_codons -frameShifts frame_shifts -privateNucMutations.reversionSubstitutions private_reversion_substitutions -privateNucMutations.labeledSubstitutions private_labeled_substitutions -privateNucMutations.unlabeledSubstitutions private_unlabeled_substitutions -privateNucMutations.totalReversionSubstitutions private_total_reversion_substitutions -privateNucMutations.totalLabeledSubstitutions private_total_labeled_substitutions -privateNucMutations.totalUnlabeledSubstitutions private_total_unlabeled_substitutions -privateNucMutations.totalPrivateSubstitutions private_total_private_substitutions -qc.snpClusters.clusteredSNPs private_snp_clusters -qc.snpClusters.totalSNPs private_total_snp_clusters diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 02bbf88..39148ca 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -5,6 +5,8 @@ and sequences. See Nextclade docs for more details on usage, inputs, and outputs if you would like to customize the rules """ +import sys + DATASET_NAME = config["nextclade"]["dataset_name"] @@ -46,28 +48,33 @@ rule run_nextclade: """ +if isinstance(config["nextclade"]["field_map"], str): + print(f"Converting config['nextclade']['field_map'] from TSV file ({config['nextclade']['field_map']}) to dictionary; " + f"consider putting the field map directly in the config file.", file=sys.stderr) + + with open(config["nextclade"]["field_map"], "r") as f: + config["nextclade"]["field_map"] = dict(line.rstrip("\n").split("\t", 1) for line in f if not line.startswith("#")) + + rule join_metadata_and_nextclade: input: nextclade="results/nextclade.tsv", metadata="data/subset_metadata.tsv", - nextclade_field_map=config["nextclade"]["field_map"], output: metadata="results/metadata.tsv", params: metadata_id_field=config["curate"]["output_id_field"], nextclade_id_field=config["nextclade"]["id_field"], + nextclade_field_map=[f"{old}={new}" for old, new in config["nextclade"]["field_map"].items()], + nextclade_fields=",".join(config["nextclade"]["field_map"].values()), shell: r""" - export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'` - - csvtk -tl cut -f $SUBSET_FIELDS \ - {input.nextclade} \ - | csvtk -tl rename2 \ - -F \ - -f '*' \ - -p '(.+)' \ - -r '{{kv}}' \ - -k {input.nextclade_field_map} \ + augur curate rename \ + --metadata {input.nextclade:q} \ + --id-column {params.nextclade_id_field:q} \ + --field-map {params.nextclade_field_map:q} \ + --output-metadata - \ + | tsv-select --header --fields {params.nextclade_fields:q} \ | tsv-join -H \ --filter-file - \ --key-fields {params.nextclade_id_field} \