[wip] merge rules
j23414 committed Aug 21, 2023
1 parent 7ec737b commit 0aa0b23
Showing 7 changed files with 537 additions and 0 deletions.
1 change: 1 addition & 0 deletions ingest/Snakefile
@@ -69,6 +69,7 @@ rule all:

include: "workflow/snakemake_rules/fetch_sequences.smk"
include: "workflow/snakemake_rules/transform.smk"
include: "workflow/snakemake_rules/merge.smk"


if config.get("upload"):
41 changes: 41 additions & 0 deletions ingest/bin/ndjson-to-tsv
@@ -0,0 +1,41 @@
#!/usr/bin/env python3
"""
Parses NDJSON records from stdin to two different files: a metadata TSV and a
sequences FASTA.
Records that do not have an ID or sequence will be excluded from the output files.
"""
import argparse
import csv
import json
from sys import stdin


if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--metadata", metavar="TSV", default="data/metadata.tsv",
help="The output metadata TSV file")
parser.add_argument("--metadata-columns", nargs="+",
help="List of fields from the NDJSON records to include as columns in the metadata TSV. " +
"Metadata TSV columns will be in the order of the columns provided.")
parser.add_argument("--id-field", default='strain',
help="Field from the records to use as the sequence ID in the FASTA file.")

args = parser.parse_args()

with open(args.metadata, 'wt') as metadata_output:
metadata_csv = csv.DictWriter(
metadata_output,
args.metadata_columns,
restval="",
extrasaction='ignore',
delimiter='\t'
)
metadata_csv.writeheader()

        for record in stdin:
            record = json.loads(record)
            metadata_csv.writerow(record)
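
A minimal sketch of how the DictWriter settings above behave (field names here
are illustrative): extrasaction='ignore' silently drops NDJSON fields that are
not listed in --metadata-columns, and restval="" fills in columns missing from
a record.

import csv
import io

buf = io.StringIO()
writer = csv.DictWriter(
    buf, ["strain", "date"], restval="", extrasaction="ignore", delimiter="\t"
)
writer.writeheader()
# "country" is dropped by extrasaction='ignore'; the missing "date" becomes ""
writer.writerow({"strain": "A/1", "country": "USA"})
print(buf.getvalue())  # strain<TAB>date, then A/1<TAB><empty>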
47 changes: 47 additions & 0 deletions ingest/bin/replace_name.pl
@@ -0,0 +1,47 @@
#! /usr/bin/env perl

use strict;
use warnings;

my $USAGE = "Usage: $0 <lookuptable.tsv> <sequences.fasta>\n";
$USAGE .= "  Replaces the names of the sequences in <sequences.fasta> with the names in <lookuptable.tsv>\n";

if (@ARGV != 2) {
    print STDERR $USAGE;
    exit 1;
}

my ($fn1, $fn2) = @ARGV;
print STDERR "Reading hashmap from $fn1 to rename headers in $fn2\n";

my $fh;
open ($fh, "<:encoding(UTF-8)", $fn1) or die "Couldn't open $fn1: $!\n";

my %headers;

while (my $line = <$fh>) {
    chomp $line;
    my @fields = split(/\t/, $line);
    next unless @fields >= 2;    # skip malformed lines without a replacement name
    $headers{$fields[0]} = $fields[1];
}
close($fh);

open ($fh, "<:encoding(UTF-8)", $fn2) or die "Couldn't open $fn2: $!\n";

while (<$fh>) {
    chomp;
    if (/^>/) {
        my $header = $_;
        $header =~ s/^>//;
        if (exists $headers{$header}) {
            print ">$headers{$header}\n";
        } else {
            # Headers missing from the lookup table pass through unchanged
            print ">$header\n";
        }
    } else {
        print "$_\n";
    }
}
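
A quick way to sanity-check the renaming behavior above, sketched in Python
(file names are hypothetical; assumes perl is available and the command is run
from the repository root):

import pathlib
import subprocess
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    tmp = pathlib.Path(tmpdir)
    (tmp / "lookup.tsv").write_text("OLD_1\tNEW_1\n")
    (tmp / "seqs.fasta").write_text(">OLD_1\nACGT\n>UNMAPPED\nTTTT\n")
    result = subprocess.run(
        ["perl", "ingest/bin/replace_name.pl",
         str(tmp / "lookup.tsv"), str(tmp / "seqs.fasta")],
        capture_output=True, text=True, check=True,
    )
    # Mapped headers are replaced; unmapped headers pass through unchanged
    assert result.stdout == ">NEW_1\nACGT\n>UNMAPPED\nTTTT\n"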
123 changes: 123 additions & 0 deletions ingest/bin/transform-citations.py
@@ -0,0 +1,123 @@
#!/usr/bin/env python3
"""
Updates the title and journal fields of the NDJSON record from stdin and outputs by making Entrez queries to fetch citations
"""

import argparse
import json
import time
import warnings
from sys import stdin, stdout

from Bio import Entrez, SeqIO


def parse_args():
parser = argparse.ArgumentParser(
description="Use Entrez query to extract citations. Fetch in batches of group size."
)
    parser.add_argument(
        "--genbank-id-field",
        default="genbank_accession",
        help="Field in the NDJSON records that holds the GenBank accession [default: genbank_accession]",
    )
    parser.add_argument(
        "--group-size",
        type=int,
        default=200,
        help="Fetch citations in batches of this size [default: 200]",
        required=False,
    )
parser.add_argument(
"--entrez-email",
default="hello@nextstrain.org",
help="Entrez email address [default:hello@nextstrain.org]",
required=False,
)
return parser.parse_args()


def fetch_and_print_citations(data: list, genbank_id_field: str):
genbank_ids = ",".join([d[genbank_id_field] for d in data])

try:
handle = Entrez.efetch(
db="nucleotide", id=genbank_ids, rettype="gb", retmode="text", retmax=1000
)
record = SeqIO.parse(handle, "genbank")

title_dict = dict()
journal_dict = dict()
for row in record:
title_dict[row.id.split(".")[0]] = row.annotations["references"][0].title
journal_dict[row.id.split(".")[0]] = row.annotations["references"][0].journal

        # Maintain the order in data
        for d in data:
            accession = d[genbank_id_field]
            if accession in title_dict:
                d["title"] = title_dict[accession]

            if accession in journal_dict:
                d["journal"] = journal_dict[accession]

            # Always print the record
            json.dump(d, stdout, allow_nan=False, indent=None, separators=",:")
            print()

handle.close()
    except Exception as exception_msg:
        # Batch fetch failed; fall back to fetching citations one record at a time
        warnings.warn(f"Batch Entrez fetch failed ({exception_msg}); retrying records individually", stacklevel=2)
        for d in data:
            fetch_one_citation(d, genbank_id_field)
            time.sleep(1)

return None


def fetch_one_citation(data: dict, genbank_id_field: str):
genbank_id = data[genbank_id_field]

try:
handle = Entrez.efetch(
db="nucleotide", id=genbank_id, rettype="gb", retmode="text", retmax=1000
)
record = SeqIO.read(handle, "genbank")

data["title"] = record.annotations["references"][0].title

json.dump(data, stdout, allow_nan=False, indent=None, separators=",:")
print()

handle.close()

except Exception as exception_msg:
warnings.warn(f"Pass through and skip title processing for {data[genbank_id_field]}: {exception_msg}", stacklevel=2)

# example: GenBank ON123563-81 records are in the ndjson but were removed from Entrez
json.dump(data, stdout, allow_nan=False, indent=None, separators=",:")
print()

return None


def main():
args = parse_args()

data = []
    chunk_size = args.group_size
genbank_id_field = args.genbank_id_field

Entrez.email = args.entrez_email

for index, record in enumerate(stdin):
data.append(json.loads(record))
if (index + 1) % chunk_size == 0:
fetch_and_print_citations(data, genbank_id_field)
data = []

if data:
fetch_and_print_citations(data, genbank_id_field)


if __name__ == "__main__":
main()
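
The batch-then-fallback flow is the core of the script; a stripped-down sketch
of the same pattern with stub fetchers (function names here are illustrative,
not part of the script):

def process_in_batches(records, batch_size, fetch_batch, fetch_one):
    """Accumulate records, flush each full batch, then flush the remainder."""
    batch = []
    for record in records:
        batch.append(record)
        if len(batch) == batch_size:
            _flush(batch, fetch_batch, fetch_one)
            batch = []
    if batch:
        _flush(batch, fetch_batch, fetch_one)

def _flush(batch, fetch_batch, fetch_one):
    try:
        fetch_batch(batch)  # one network round trip for the whole batch
    except Exception:
        for record in batch:  # degrade to per-record fetches on failure
            fetch_one(record)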
17 changes: 17 additions & 0 deletions ingest/bin/tsv-to-ndjson
@@ -0,0 +1,17 @@
#!/usr/bin/env python3
"""
Copied from "bin/csv-to-ndjson" in nextstrain/ncov-ingest:
https://github.com/nextstrain/ncov-ingest/blob/2a5f255329ee5bdf0cabc8b8827a700c92becbe4/bin/csv-to-ndjson
Convert CSV on stdin to NDJSON on stdout.
"""
import csv
import json
from sys import stdin, stdout

# 200 MiB; default is 128 KiB
csv.field_size_limit(200 * 1024 * 1024)

for row in csv.DictReader(stdin, delimiter='\t'):
    json.dump(row, stdout, allow_nan=False, indent=None, separators=',:')
    print()
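
A minimal round-trip of the conversion above, with made-up values:

import csv
import io
import json

tsv = "strain\tdate\nA/1\t2023-01-01\n"
for row in csv.DictReader(io.StringIO(tsv), delimiter="\t"):
    print(json.dumps(row, separators=(",", ":")))
# {"strain":"A/1","date":"2023-01-01"}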
