NCBI Dataset field name transformations

Organize field renaming into two parts: 1. Rename the NCBI output columns to match the NCBI mnemonics¹ (see `source-data/ncbi-dataset-field-map.tsv`). 2. Where necessary, rename the NCBI mnemonics to match Nextstrain expected column names² (see "transform: field_map:" in `config/config.yaml`). ¹ https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields ² https://docs.nextstrain.org/projects/ncov/en/latest/reference/metadata-fields.html
nextstrain · Nov 13, 2023 · a386f7d · a386f7d
1 parent d154a88
commit a386f7d
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 59 deletions.
diff --git a/ingest/bin/reverse_reversed_sequences.py b/ingest/bin/reverse_reversed_sequences.py
diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
@@ -7,17 +7,31 @@ ncbi_field_map: 'source-data/ncbi-dataset-field-map.tsv'
 
 # Params for the transform rule
 transform:
-  # Fields to rename.
+  # NCBI fields to rename to Nextstrain field names.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution']
+  field_map: [
+    'accession=genbank_accession',
+    'accession-rev=genbank_accession_rev',
+    'isolate-lineage=strain',
+    'sourcedb=database',
+    'geo-region=region',
+    'geo-location=location',
+    'host-name=host',
+    'isolate-collection-date=date',
+    'release-date=release_date',
+    'update-date=update_date',
+    'sra-accs=sra_accessions',
+    'submitter-names=authors',
+    'submitter-affiliation=institution',  # key must match the mnemonic in source-data/ncbi-dataset-field-map.tsv
+  ]
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
   # Back up strain name field if 'strain' doesn't match regex above
-  strain_backup_fields: ['accession']
+  strain_backup_fields: ['genbank_accession']
   # List of date fields to standardize
-  date_fields: ['date', 'date_submitted']
+  date_fields: ['date', 'release_date', 'update_date']
   # Expected date formats present in date fields
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -47,26 +61,27 @@ transform:
   # User annotations file
   annotations: 'source-data/annotations.tsv'
   # ID field used to merge annotations
-  annotations_id: 'accession'
+  annotations_id: 'genbank_accession'
   # Field to use as the sequence ID in the FASTA file
-  id_field: 'accession'
+  id_field: 'genbank_accession'
   # Field to use as the sequence in the FASTA file
   sequence_field: 'sequence'
   # Final output columns for the metadata TSV
   metadata_columns: [
-    'accession',
+    'genbank_accession',
     'genbank_accession_rev',
     'strain',
     'date',
     'region',
     'country',
     'division',
     'location',
+    'length',
     'host',
-    'date_submitted',
-    'sra_accession',
+    'release_date',
+    'update_date',
+    'sra_accessions',
     'abbr_authors',
-    'reverse',
     'authors',
     'institution'
   ]

diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
@@ -1,17 +1,17 @@
+# Maps the NCBI output TSV column names back to the NCBI mnemonics.
+# This list should match the list in
+# ingest/workflow/snakemake_rules/fetch_sequences.smk _get_ncbi_dataset_field_mnemonics
 key	value
-Accession	genbank_accession_rev
-Source database	database
-Isolate Lineage	strain
-Geographic Region	region
-Geographic Location	location
-Isolate Collection date	collected
-Release date	submitted
-Update date	updated
+Accession	accession-rev
+Source database	sourcedb
+Isolate Lineage	isolate-lineage
+Geographic Region	geo-region
+Geographic Location	geo-location
+Isolate Collection date	isolate-collection-date
+Release date	release-date
+Update date	update-date
 Length	length
-Host Name	host
-Isolate Lineage source	isolation_source
-BioProjects	bioproject_accession
-BioSample accession	biosample_accession
-SRA Accessions	sra_accession
-Submitter Names	authors
-Submitter Affiliation	submitting_organization
+Host Name	host-name
+SRA Accessions	sra-accs
+Submitter Names	submitter-names
+Submitter Affiliation	submitter-affiliation
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -76,8 +76,7 @@ def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
 
 
 rule format_ncbi_dataset_report:
-    # Formats the headers to be the same as before we used NCBI Datasets
-    # The only fields we do not have equivalents for are "title" and "publications"
+    # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
         ncbi_field_map=config["ncbi_field_map"],
@@ -93,8 +92,8 @@ rule format_ncbi_dataset_report:
             --package {input.dataset_package} \
             --fields {params.fields_to_include:q} \
             | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
-            | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \
-            | tsv-select -H -f genbank_accession --rest last \
+            | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -114,7 +113,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column genbank_accession_rev \
+            --seq-id-column accession-rev \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \