Skip to content

Commit

Permalink
ingest: Add rules and config params for curation
Browse files Browse the repository at this point in the history
  • Loading branch information
joverlee521 committed Sep 14, 2023
1 parent b61121f commit b3e7515
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 1 deletion.
3 changes: 2 additions & 1 deletion ingest/Snakefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Use default configuration values. Override with Snakemake's --configfile/--config options.
configfile: "config/defaults.yaml"

include: "rules/fetch_from_ncbi.smk"
include: "rules/fetch-from-ncbi.smk"
include: "rules/curate.smk"
6 changes: 6 additions & 0 deletions ingest/config/annotations.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Manually curated annotations TSV file
# The TSV should not have a header and should have exactly three columns:
# id to match existing metadata, field name, and field value
# If there are multiple annotations for the same id and field, then the last value is used
# Lines starting with '#' are treated as comments
# Any text following a '#' after the field value is treated as a comment.
44 changes: 44 additions & 0 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,47 @@ ncbi_taxon_id: ""

# Optional fields to add to the NCBI Datasets output
ncbi_dataset_fields: []

# Config parameters related to the curate pipeline
# All keys below are nested under `curate` because the workflow reads them as
# config["curate"][...] (and config["curate"]["titlecase"][...]).
curate:
  # URL pointing to the public generalized geolocation rules
  # For the Nextstrain team, this is currently
  # 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv'
  geolocation_rules_url: ""
  # The path to the local geolocation rules within the pathogen repo
  # The path should be relative to the ingest directory.
  local_geolocation_rules: "config/geolocation_rules.tsv"
  # List of field names to change in the format of <old_field_name>=<new_field_name>
  # This is the first step in the pipeline, so any references to field names
  # in the configs below should use the new field names
  field_map: []
  # List of date fields to standardize to ISO format YYYY-MM-DD
  date_fields: []
  # List of expected date formats that are present in the date fields provided above
  # These date formats should use directives expected by datetime
  # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
  expected_date_formats: []
  titlecase:
    # List of string fields to titlecase
    fields: []
    # List of abbreviations not cast to titlecase, keeps uppercase
    abbreviations: []
    # Articles that should not be cast to titlecase
    articles: []
  # Metadata field that contains the list of authors associated with the sequence
  authors_field: ""
  # Default value to use if the authors field is empty
  authors_default_value: ""
  # Name to use for the generated abbreviated authors field
  abbr_authors_field: ""
  # Path to the manual annotations file
  # The path should be relative to the ingest directory
  annotations: "config/annotations.tsv"
  # The ID field in the metadata to use to merge the manual annotations
  annotations_id: ""
  # The ID field in the metadata to use as the sequence id in the output FASTA file
  output_id_field: ""
  # The field in the NDJSON record that contains the actual genomic sequence
  output_seq_field: ""
  # The list of metadata columns to keep in the final output of the curation pipeline.
  metadata_columns: []
6 changes: 6 additions & 0 deletions ingest/config/geolocation_rules.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# TSV file of geolocation rules with the format:
# '<raw_geolocation><tab><annotated_geolocation>' where the raw and annotated geolocations
# are formatted as '<region>/<country>/<division>/<location>'
# If creating a general rule, then the raw field value can be substituted with '*'
# Lines starting with '#' will be ignored as comments.
# Text following a trailing '#' will be ignored as a comment.
108 changes: 108 additions & 0 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""
This part of the workflow handles the curation of metadata for sequences
from NCBI and outputs the clean data as two separate files:
- results/subset_metadata.tsv
- results/sequences.fasta
"""

# The following two rules can be ignored if you choose not to use the
# generalized geolocation rules that are shared across pathogens.
# The Nextstrain team will try to maintain a generalized set of geolocation
# rules that can then be overridden by local geolocation rules per pathogen repo.
rule fetch_general_geolocation_rules:
    # Downloads the shared geolocation rules from the URL configured at
    # config["curate"]["geolocation_rules_url"] into the data/ directory.
    output:
        general_geolocation_rules = "data/general-geolocation-rules.tsv"
    params:
        geolocation_rules_url = config["curate"]["geolocation_rules_url"]
    shell:
        # --fail: exit non-zero on HTTP errors instead of saving the error page
        #         as if it were the rules file.
        # --location: follow redirects.
        # --silent --show-error: no progress meter, but still report errors.
        # The ':q' modifier shell-quotes the URL so characters such as '&' or '?'
        # are not interpreted by the shell.
        """
        curl --fail --silent --show-error --location {params.geolocation_rules_url:q} > {output.general_geolocation_rules}
        """

rule concat_geolocation_rules:
    # Concatenates the general geolocation rules followed by the local
    # (per-pathogen) rules into a single rules file.
    input:
        general_geolocation_rules = "data/general-geolocation-rules.tsv",
        local_geolocation_rules = config["curate"]["local_geolocation_rules"]
    output:
        all_geolocation_rules = "data/all-geolocation-rules.tsv"
    shell:
        # Use '>' (truncate) rather than '>>' (append) so the output always
        # contains exactly one copy of each input, even if a stale output file
        # is still present when the command runs.
        """
        cat {input.general_geolocation_rules} {input.local_geolocation_rules} > {output.all_geolocation_rules}
        """

# This curate pipeline is based on existing pipelines for pathogen repos using NCBI data.
# You may want to add and/or remove steps from the pipeline for custom metadata
# curation for your pathogen. Note that the curate pipeline is streaming NDJSON
# records between scripts, so any custom scripts added to the pipeline should expect
# the input as NDJSON records from stdin and output NDJSON records to stdout.
# The final step of the pipeline should convert the NDJSON records to two
# separate files: a metadata TSV and a sequences FASTA.
rule curate:
    # Streams the NDJSON records in data/ncbi.ndjson through a chain of
    # vendored scripts and `augur curate` subcommands (field renaming, string
    # normalization, date formatting, location/titlecase/author transforms,
    # geolocation rules, manual annotations) and writes a metadata TSV and a
    # sequences FASTA. stderr of the whole pipeline is appended to the log file.
    input:
        sequences_ndjson = "data/ncbi.ndjson",
        # Change the geolocation_rules input path if you are removing the above two rules
        all_geolocation_rules = "data/all-geolocation-rules.tsv",
        annotations = config["curate"]["annotations"]
    output:
        metadata = "results/all_metadata.tsv",
        sequences = "results/sequences.fasta"
    log:
        "logs/curate.txt"
    benchmark:
        "benchmarks/curate.txt"
    params:
        field_map=config["curate"]["field_map"],
        date_fields=config["curate"]["date_fields"],
        expected_date_formats=config["curate"]["expected_date_formats"],
        articles=config["curate"]["titlecase"]["articles"],
        abbreviations=config["curate"]["titlecase"]["abbreviations"],
        titlecase_fields=config["curate"]["titlecase"]["fields"],
        authors_field=config["curate"]["authors_field"],
        authors_default_value=config["curate"]["authors_default_value"],
        abbr_authors_field=config["curate"]["abbr_authors_field"],
        annotations_id=config["curate"]["annotations_id"],
        id_field=config["curate"]["output_id_field"],
        # The config key is `output_seq_field` (see config/defaults.yaml);
        # reading `output_sequence_field` raised a KeyError.
        sequence_field=config["curate"]["output_seq_field"],
    shell:
        """
        (cat {input.sequences_ndjson} \
            | ./vendored/transform-field-names \
                --field-map {params.field_map} \
            | augur curate normalize-strings \
            | augur curate format-dates \
                --date-fields {params.date_fields} \
                --expected-date-formats {params.expected_date_formats} \
            | ./vendored/transform-genbank-location \
            | augur curate titlecase \
                --titlecase-fields {params.titlecase_fields} \
                --articles {params.articles} \
                --abbreviations {params.abbreviations} \
            | ./vendored/transform-authors \
                --authors-field {params.authors_field} \
                --default-value {params.authors_default_value} \
                --abbr-authors-field {params.abbr_authors_field} \
            | ./vendored/apply-geolocation-rules \
                --geolocation-rules {input.all_geolocation_rules} \
            | ./vendored/merge-user-metadata \
                --annotations {input.annotations} \
                --id-field {params.annotations_id} \
            | augur curate passthru \
                --output-metadata {output.metadata} \
                --output-fasta {output.sequences} \
                --output-id-field {params.id_field} \
                --output-seq-field {params.sequence_field} ) 2>> {log}
        """

rule subset_metadata:
    # Selects the columns listed in config["curate"]["metadata_columns"] from
    # the full curated metadata TSV and writes them to a smaller TSV.
    input:
        # Named items must use `name = value`; `name: value` is a syntax error
        # inside a Snakemake rule section.
        metadata = "results/all_metadata.tsv"
    output:
        subset_metadata = "results/subset_metadata.tsv"
    params:
        # tsv-select takes a comma-separated field list, while Snakemake would
        # render a list param space-separated, so join the columns explicitly.
        metadata_fields = ",".join(config["curate"]["metadata_columns"])
    shell:
        """
        tsv-select -H -f {params.metadata_fields} \
            {input.metadata} > {output.subset_metadata}
        """

0 comments on commit b3e7515

Please sign in to comment.