ingest: Add rules and config params for curation

nextstrain · Sep 14, 2023 · b3e7515 · b3e7515
1 parent b61121f
commit b3e7515
Show file tree

Hide file tree

Showing 5 changed files with 166 additions and 1 deletion.
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -1,4 +1,5 @@
 # Use default configuration values. Override with Snakemake's --configfile/--config options.
 configfile: "config/defaults.yaml"
 
-include: "rules/fetch_from_ncbi.smk"
+# Rules that fetch the raw records from NCBI (file name uses underscores; it is not renamed here).
+include: "rules/fetch_from_ncbi.smk"
+# Rules that curate the fetched records into a metadata TSV and a sequences FASTA.
+include: "rules/curate.smk"
diff --git a/ingest/config/annotations.tsv b/ingest/config/annotations.tsv
@@ -0,0 +1,6 @@
+# Manually curated annotations TSV file
+# The TSV should not have a header and should have exactly three columns:
+# id to match existing metadata, field name, and field value
+# If there are multiple annotations for the same id and field, then the last value is used
+# Lines starting with '#' are treated as comments
+# Any text following a '#' after the field value is treated as a comment.
diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -12,3 +12,47 @@ ncbi_taxon_id: ""
 
 # Optional fields to add to the NCBI Datasets output
 ncbi_dataset_fields: []
+
+# Config parameters related to the curate pipeline
+curate:
+  # URL pointing to the public generalized geolocation rules (fetched with curl)
+  # For the Nextstrain team, this is currently
+  # 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
+  geolocation_rules_url: ""
+  # The path to the local geolocation rules within the pathogen repo
+  # The path should be relative to the ingest directory.
+  local_geolocation_rules: "config/geolocation_rules.tsv"
+  # List of field names to change in the format of <old_field_name>=<new_field_name>
+  # This is the first step in the pipeline, so any references to field names
+  # in the configs below should use the new field names
+  field_map: []
+  # List of date fields to standardize to ISO format YYYY-MM-DD
+  date_fields: []
+  # List of expected date formats that are present in the date fields provided above
+  # These date formats should use directives expected by datetime
+  # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
+  expected_date_formats: []
+  titlecase:
+    # List of string fields to titlecase
+    fields: []
+    # List of abbreviations that are kept uppercase instead of being cast to titlecase
+    abbreviations: []
+    # Articles that should not be cast to titlecase
+    articles: []
+  # Metadata field that contains the list of authors associated with the sequence
+  authors_field: ""
+  # Default value to use if the authors field is empty
+  authors_default_value: ""
+  # Name to use for the generated abbreviated authors field
+  abbr_authors_field: ""
+  # Path to the manual annotations file
+  # The path should be relative to the ingest directory
+  annotations: "config/annotations.tsv"
+  # The ID field in the metadata to use to merge the manual annotations
+  annotations_id: ""
+  # The ID field in the metadata to use as the sequence id in the output FASTA file
+  output_id_field: ""
+  # The field in the NDJSON record that contains the actual genomic sequence
+  output_seq_field: ""
+  # The list of metadata columns to keep in the final output of the curation pipeline.
+  # These columns are selected from results/all_metadata.tsv by the subset_metadata rule.
+  metadata_columns: []
diff --git a/ingest/config/geolocation_rules.tsv b/ingest/config/geolocation_rules.tsv
@@ -0,0 +1,6 @@
+# TSV file of geolocation rules with the format:
+# '<raw_geolocation><tab><annotated_geolocation>' where the raw and annotated geolocations
+# are formatted as '<region>/<country>/<division>/<location>'
+# If creating a general rule, then the raw field value can be substituted with '*'
+# Lines starting with '#' will be ignored as comments.
+# Any text following a trailing '#' is also ignored as a comment.
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -0,0 +1,108 @@
+"""
+This part of the workflow handles the curation of metadata for sequences
+from NCBI and outputs the clean data as two separate files:
+
+    - results/subset_metadata.tsv
+    - results/sequences.fasta
+"""
+
+# The following two rules can be ignored if you choose not to use the
+# generalized geolocation rules that are shared across pathogens.
+# The Nextstrain team will try to maintain a generalized set of geolocation
+# rules that can then be overridden by local geolocation rules per pathogen repo.
+# Download the generalized geolocation rules from the URL configured in
+# config["curate"]["geolocation_rules_url"] to data/general-geolocation-rules.tsv.
+#
+# curl flags:
+#   --fail        exit non-zero on HTTP errors instead of saving the error page as the rules file
+#   --location    follow redirects
+#   --silent / --show-error    hide the progress meter but still print failures
+# The URL is shell-quoted (:q) so characters such as '&' or '?' are not interpreted by the shell.
+rule fetch_general_geolocation_rules:
+    output:
+        general_geolocation_rules = "data/general-geolocation-rules.tsv"
+    params:
+        geolocation_rules_url = config["curate"]["geolocation_rules_url"]
+    shell:
+        """
+        curl --fail --silent --show-error --location {params.geolocation_rules_url:q} \
+            > {output.general_geolocation_rules}
+        """
+
+# Concatenate the general geolocation rules and the local (per-pathogen) rules
+# into data/all-geolocation-rules.tsv; the local rules come last in the file.
+# The output is written with '>' (truncate) rather than '>>' (append) so that a
+# pre-existing output file can never leave duplicated or stale rules behind.
+rule concat_geolocation_rules:
+    input:
+        general_geolocation_rules = "data/general-geolocation-rules.tsv",
+        local_geolocation_rules = config["curate"]["local_geolocation_rules"]
+    output:
+        all_geolocation_rules = "data/all-geolocation-rules.tsv"
+    shell:
+        """
+        cat {input.general_geolocation_rules} {input.local_geolocation_rules} > {output.all_geolocation_rules}
+        """
+
+# This curate pipeline is based on existing pipelines for pathogen repos using NCBI data.
+# You may want to add and/or remove steps from the pipeline for custom metadata
+# curation for your pathogen. Note that the curate pipeline is streaming NDJSON
+# records between scripts, so any custom scripts added to the pipeline should expect
+# the input as NDJSON records from stdin and output NDJSON records to stdout.
+# The final step of the pipeline should convert the NDJSON records to two
+# separate files: a metadata TSV and a sequences FASTA.
+rule curate:
+    # Streams the NDJSON records in data/ncbi.ndjson through the chain of curation
+    # commands below and writes results/all_metadata.tsv and results/sequences.fasta.
+    # stderr of the whole pipeline is appended to the rule's log file.
+    input:
+        sequences_ndjson = "data/ncbi.ndjson",
+        # Change the geolocation_rules input path if you are removing the above two rules
+        all_geolocation_rules = "data/all-geolocation-rules.tsv",
+        annotations = config["curate"]["annotations"]
+    output:
+        metadata = "results/all_metadata.tsv",
+        sequences = "results/sequences.fasta"
+    log:
+        "logs/curate.txt"
+    benchmark:
+        "benchmarks/curate.txt"
+    params:
+        field_map=config["curate"]["field_map"],
+        date_fields=config["curate"]["date_fields"],
+        expected_date_formats=config["curate"]["expected_date_formats"],
+        articles=config["curate"]["titlecase"]["articles"],
+        abbreviations=config["curate"]["titlecase"]["abbreviations"],
+        titlecase_fields=config["curate"]["titlecase"]["fields"],
+        authors_field=config["curate"]["authors_field"],
+        authors_default_value=config["curate"]["authors_default_value"],
+        abbr_authors_field=config["curate"]["abbr_authors_field"],
+        annotations_id=config["curate"]["annotations_id"],
+        id_field=config["curate"]["output_id_field"],
+        # The config key is `output_seq_field` (see config/defaults.yaml); reading any
+        # other key name raises a KeyError when the workflow is parsed.
+        sequence_field=config["curate"]["output_seq_field"],
+    shell:
+        """
+        (cat {input.sequences_ndjson} \
+            | ./vendored/transform-field-names \
+                --field-map {params.field_map} \
+            | augur curate normalize-strings \
+            | augur curate format-dates \
+                --date-fields {params.date_fields} \
+                --expected-date-formats {params.expected_date_formats} \
+            | ./vendored/transform-genbank-location \
+            | augur curate titlecase \
+                --titlecase-fields {params.titlecase_fields} \
+                --articles {params.articles} \
+                --abbreviations {params.abbreviations} \
+            | ./vendored/transform-authors \
+                --authors-field {params.authors_field} \
+                --default-value {params.authors_default_value} \
+                --abbr-authors-field {params.abbr_authors_field} \
+            | ./vendored/apply-geolocation-rules \
+                --geolocation-rules {input.all_geolocation_rules} \
+            | ./vendored/merge-user-metadata \
+                --annotations {input.annotations} \
+                --id-field {params.annotations_id} \
+            | augur curate passthru \
+                --output-metadata {output.metadata} \
+                --output-fasta {output.sequences} \
+                --output-id-field {params.id_field} \
+                --output-seq-field {params.sequence_field} ) 2>> {log}
+        """
+
+# Select the columns listed in config["curate"]["metadata_columns"] from
+# results/all_metadata.tsv and write them to results/subset_metadata.tsv.
+rule subset_metadata:
+    # Named inputs/outputs/params are keyword arguments, so they use `name = value`.
+    input:
+        metadata = "results/all_metadata.tsv"
+    output:
+        subset_metadata = "results/subset_metadata.tsv"
+    params:
+        # tsv-select takes a single comma-separated field list; a bare list would be
+        # rendered space-separated and the extra names would be read as file arguments.
+        metadata_fields = ",".join(config["curate"]["metadata_columns"])
+    shell:
+        """
+        tsv-select -H -f {params.metadata_fields} \
+            {input.metadata} > {output.subset_metadata}
+        """