ingest: Rename Nextclade metadata fields with augur curate rename

This construction reads much clearer and cleaner. Moves the Nextclade field map directly and more conveniently into the YAML config instead of referencing a separate TSV file. Putting the field map into a separate file seemed to be only for the sake of the --kv-file (-k) interface provided by `csvtk rename2`, which we're no longer using here. For backwards compatibility, configs that reference a TSV file are still supported and will be handled on-the-fly. Note that `augur curate` commands currently emit CSV-like TSVs that are limited to be IANA-like¹ such that parsing them with tsv-utils is most appropriate, hence the switch from `csvtk cut` to `tsv-select`. ¹ See <nextstrain/augur#1566>.
nextstrain · Sep 10, 2024 · faebd64 · faebd64
1 parent 762acdb
commit faebd64
Show file tree

Hide file tree

Showing 3 changed files with 46 additions and 40 deletions.
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -124,5 +124,32 @@ curate:
   genotype_field: "virus_name"
 nextclade:
   dataset_name: "nextstrain/measles/N450/WHO-2012"
-  field_map: "defaults/nextclade_field_map.tsv"
+  field_map:
+    # Each key should be the original column name of the Nextclade TSV
+    # Each value should be the new column name to use in the final metadata TSV
+    # Nextclade can have pathogen specific output columns so make sure to check which
+    # columns would be useful for your downstream phylogenetic analysis.
+    seqName: seqName
+    clade: clade
+    coverage: coverage
+    totalMissing: missing_data
+    totalSubstitutions: divergence
+    totalNonACGTNs: nonACGTN
+    qc.overallStatus: QC_overall
+    qc.missingData.status: QC_missing_data
+    qc.mixedSites.status: QC_mixed_sites
+    qc.privateMutations.status: QC_rare_mutations
+    qc.snpClusters.status: QC_snp_clusters
+    qc.frameShifts.status: QC_frame_shifts
+    qc.stopCodons.status: QC_stop_codons
+    frameShifts: frame_shifts
+    privateNucMutations.reversionSubstitutions: private_reversion_substitutions
+    privateNucMutations.labeledSubstitutions: private_labeled_substitutions
+    privateNucMutations.unlabeledSubstitutions: private_unlabeled_substitutions
+    privateNucMutations.totalReversionSubstitutions: private_total_reversion_substitutions
+    privateNucMutations.totalLabeledSubstitutions: private_total_labeled_substitutions
+    privateNucMutations.totalUnlabeledSubstitutions: private_total_unlabeled_substitutions
+    privateNucMutations.totalPrivateSubstitutions: private_total_private_substitutions
+    qc.snpClusters.clusteredSNPs: private_snp_clusters
+    qc.snpClusters.totalSNPs: private_total_snp_clusters
   id_field: "seqName"
diff --git a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -5,6 +5,8 @@ and sequences.
 See Nextclade docs for more details on usage, inputs, and outputs if you would
 like to customize the rules
 """
+import sys
+
 DATASET_NAME = config["nextclade"]["dataset_name"]
 
 
@@ -46,28 +48,33 @@ rule run_nextclade:
         """
 
 
+if isinstance(config["nextclade"]["field_map"], str):
+    print(f"Converting config['nextclade']['field_map'] from TSV file ({config['nextclade']['field_map']}) to dictionary; "
+          f"consider putting the field map directly in the config file.", file=sys.stderr)
+
+    with open(config["nextclade"]["field_map"], "r") as f:
+        config["nextclade"]["field_map"] = dict(line.rstrip("\n").split("\t", 1) for line in f if not line.startswith("#"))
+
+
 rule join_metadata_and_nextclade:
     input:
         nextclade="results/nextclade.tsv",
         metadata="data/subset_metadata.tsv",
-        nextclade_field_map=config["nextclade"]["field_map"],
     output:
         metadata="results/metadata.tsv",
     params:
         metadata_id_field=config["curate"]["output_id_field"],
         nextclade_id_field=config["nextclade"]["id_field"],
+        nextclade_field_map=[f"{old}={new}" for old, new in config["nextclade"]["field_map"].items()],
+        nextclade_fields=",".join(config["nextclade"]["field_map"].values()),
     shell:
         r"""
-        export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
-
-        csvtk -tl cut -f $SUBSET_FIELDS \
-            {input.nextclade} \
-        | csvtk -tl rename2 \
-            -F \
-            -f '*' \
-            -p '(.+)' \
-            -r '{{kv}}' \
-            -k {input.nextclade_field_map} \
+        augur curate rename \
+            --metadata {input.nextclade:q} \
+            --id-column {params.nextclade_id_field:q} \
+            --field-map {params.nextclade_field_map:q} \
+            --output-metadata - \
+        | tsv-select --header --fields {params.nextclade_fields:q} \
         | tsv-join -H \
             --filter-file - \
             --key-fields {params.nextclade_id_field} \