Command line interface
Note: The following code is for reference only, and not intended as a structured tutorial.
```shell
# at place A, create some json
zoo init file.json # validation checks on records at this stage
# more modifications
zoo commit -m 'new RNA virus assemblies from study'
dat share . # generates link asdew3es...
# in some faraway place B
dat clone asdew3es...
zoo add --db zika --cell new_study
# rename
zoo drop --db zika --cell new_study --force
zoo add --db zika --cell renamed_study
# now we can do analyses, e.g. MSA against some flaviviruses
zoo commit -m 'new RNA viruses related to flavivirus'
# at some place C, somebody already cloned zoo B
zoo pull # new sequences now present
zoo status --db zika
# This is lucky, because B got frustrated and deleted everything.
zoo destroy --db zika
```
Example:
```shell
zoo init --db zika --cell a zoo/data/cell_a.json
# Initializing data cell.
# Inserted 3 entries into cell "a".
zoo add --db zika --cell a --primkey genbank.a zoo/data/cell_b.json
# Loading data cell.
# Index created on field "genbank.a".
# 1 documents inserted in cell "a".
# 3 duplicates skipped.
zoo add --db zika --cell a cell_c.json
# Loading data cell.
# 2 documents inserted in cell "a".
zoo init --db zika --cell c zoo/data/cell_c_change.json
zoo commit --db zika --cell c --ksize 3,4,5 --n 5 cell_c_change_commit
# now pull these changes to cell "a"
zoo drop --db zika --cell a --force
# Dropped cell "a" from database "zika".
```
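A commit's `--ksize` and `--n` flags set the k-mer sizes and the number of hash values per minhash signature. The following toy sketch illustrates the underlying idea of a bottom-n minhash (a hypothetical helper for illustration, not zoo's actual implementation):

```python
import hashlib

def minhash(sequence, ksize, n):
    """Keep the n smallest k-mer hashes as a sketch of the sequence."""
    kmers = {sequence[i:i + ksize] for i in range(len(sequence) - ksize + 1)}
    hashes = sorted(int(hashlib.md5(k.encode()).hexdigest(), 16) for k in kmers)
    return hashes[:n]

# one sketch per k-mer size, mirroring --ksize 3,4,5 --n 5
sketches = {k: minhash('ACTAACCTATA', ksize=k, n=5) for k in (3, 4, 5)}
```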
pull
```shell
zoo init --db virus --cell original virus.json
# Initializing data cell.
# 3 entries inserted into cell "original".
# Primary key assigned to field "_id".
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 original
mkdir send
cp original.json send/
dat share send/
# Syncing Dat Archive: /Users/pi/tmp/send
# Link: dat://73401e1b931164763ecc5a04fad78e4788682677cefc718ebf49f6b4fe4dbad7
mkdir receive
dat clone dat://73401e1b931164763ecc5a04fad78e4788682677cefc718ebf49f6b4fe4dbad7 receive/
# Download Finished!
# Total size: 1 file (484 B)
ls receive
# original.json
```
```python
from pymongo import MongoClient
c = MongoClient('localhost:27017')['virus']['original']
[i for i in c.find()]
# we did experiments to replace the "N"s in the bunyavirus sequence with nucleotides
c.update_one({'virus': 'bunya'}, {'$set': {'sequence': 'ACTACCTTATA'}})
```
```shell
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 original
# before
cat original.json
{"_id": "89d96f57-63d1-4efc-9c5b-13af6473eaad", "alt_id": {"gb": "gb1"}, "md5": "e7b1f3d8199b4b7fd5d54af4a1afac37", "sequence": "ACTAACCTATA", "virus": "flavi"}
{"_id": "ecd51cba-ce85-4c48-b63d-40a29a1b6676", "alt_id": {"gb": "gb1"}, "md5": "87bb94d7795874f84ef7731a823be434", "sequence": "TTTAACCTATA", "virus": "corona"}
{"_id": "780ca018-6267-440d-86e0-56fe0c211d70", "alt_id": {"gb": "gb1"}, "md5": "4c720baa79bfaf58597660b9720cd5d8", "sequence": "ACTANNNNATA", "virus": "bunya"}
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 original
# Dumping data cell.
# | 3 Elapsed Time: 0:00:00
# Done.
# after
cat original.json
{"_id": "89d96f57-63d1-4efc-9c5b-13af6473eaad", "alt_id": {"gb": "gb1"}, "md5": "e7b1f3d8199b4b7fd5d54af4a1afac37", "sequence": "ACTAACCTATA", "virus": "flavi"}
{"_id": "ecd51cba-ce85-4c48-b63d-40a29a1b6676", "alt_id": {"gb": "gb1"}, "md5": "87bb94d7795874f84ef7731a823be434", "sequence": "TTTAACCTATA", "virus": "corona"}
{"_id": "780ca018-6267-440d-86e0-56fe0c211d70", "alt_id": {"gb": "gb1"}, "md5": "e5a49f574d58bfc3d27fe2c93285a199", "sequence": "ACTACCTTATA", "virus": "bunya"}
zoo drop --db virus --cell original --force
```

dat is still buggy at this point; try without it and "simulate changed files" instead:
```shell
zoo drop --db virus --cell original --force
zoo init --db virus --cell original virus.json
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 original
```
```python
from pymongo import MongoClient
c = MongoClient('localhost:27017')['virus']['original']
[i for i in c.find()]
# we did experiments to replace the "N"s in the bunyavirus sequence with nucleotides
c.update_one({'virus': 'bunya'}, {'$set': {'sequence': 'ACTACCTTATA'}})
```
```shell
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 modified
zoo add --db virus --cell modified original.json
zoo pull --db virus --cell modified modified.json
# Updating cell's md5 hashes.
# / 0 Elapsed Time: 0:00:00
#
# 2 entries unchanged.
# 1 entries replaced.
tail -n1 original.json
# {"_id": "c133bf2d-04b1-4c8e-910a-c41e5376bae5", "alt_id": {"gb": "gb1"}, "md5": "4c720baa79bfaf58597660b9720cd5d8", "sequence": "ACTANNNNATA", "virus": "bunya"}
tail -n1 modified.json
# {"_id": "c133bf2d-04b1-4c8e-910a-c41e5376bae5", "alt_id": {"gb": "gb1"}, "md5": "e5a49f574d58bfc3d27fe2c93285a199", "sequence": "ACTACCTTATA", "virus": "bunya"}
diff
```shell
zoo add --db diff --cell mock tests/cell_a.json
zoo diff --db diff --cell mock --out diff.json tests/cell_b.json
cat diff.json
```
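Conceptually, a diff between two cells is a set comparison on stable identifiers. A toy sketch of the idea (hypothetical; zoo's actual diff semantics may differ):

```python
import json

def load_cell(path):
    """Load an ND-JSON cell dump, keyed on "_id"."""
    with open(path) as f:
        return {d['_id']: d for d in map(json.loads, f)}

a = load_cell('tests/cell_a.json')
b = load_cell('tests/cell_b.json')

# documents present in cell_b but missing from cell_a
new = [doc for _id, doc in b.items() if _id not in a]
```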
minhash, SBT
```shell
$ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 \
reference
Initialize SBT.
Compute minhash signatures for selected documents.
k-mer size: 16, sketch size: 1000
\ 9158 Elapsed Time: 0:01:45
Save SBT.
Done.
$ sourmash sbt_search --ksize 16 reference survey.fa.sig
# running sourmash subcommand: sbt_search
loaded query: survey.fa... (k=16, DNA)
0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)
```
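The signatures behind the SBT can also be computed from Python. A minimal sketch using sourmash's `MinHash` class (assuming a recent sourmash release; the parameters mirror `--ksize 16 --nsketch 1000`):

```python
import sourmash

# bottom-1000 minhash over 16-mers, as in the sbt_index call above
mh = sourmash.MinHash(n=1000, ksize=16)
mh.add_sequence('ACTG' * 25)  # toy sequence
sig = sourmash.SourmashSignature(mh, name='toy')

with open('toy.sig', 'w') as fp:
    sourmash.save_signatures([sig], fp)
```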
status
```shell
zoo status --db diff --cell mock --example
```
dat
```shell
cp data/tests/cell_a.json tmp/dump.json
dat share send/
dat clone <link> receive/
```

```python
import json
import random

# read the line-delimited JSON dump
l = []
with open('tmp/dump.json', 'r') as file:
    for line in file:
        l.append(json.loads(line.strip()))

# modify
for i in l:
    del i['dangerous']
    i['random'] = random.uniform(0, 1)

# dump again
with open('send/dump.json', 'w') as file:
    for i in l:
        file.write(json.dumps(i))
        file.write('\n')
```

```shell
dat pull receive/
# changes updated
```
load
```shell
zoo load --source ncbi --fmt json --email '' \
--ids data/rna_virome_shi2016/rna_virome_shi2016.txt \
result.json
# streaming GenBank records in JSON format to cell
zoo load --source ncbi --fmt json --email '' \
--ids data/rna_virome_shi2016/rna_virome_shi2016.txt \
--stdout - | \
zoo init --db foo --cell bar -
```
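The trailing `-` tells `zoo init` to read the record stream from stdin instead of a file. A minimal sketch of such a consumer (hypothetical; not zoo's actual implementation):

```python
import json
import sys
from pymongo import MongoClient

cell = MongoClient('localhost:27017')['foo']['bar']

# one JSON record per line on stdin, straight into the cell
for line in sys.stdin:
    cell.insert_one(json.loads(line))
```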
dump
```shell
cat q.json
# {}
zoo dump --query q.json --selection _id,meta.date,meta.geo.cou,seq \
--delim "|" --fmt fasta dump.fa
head -n2 dump.fa
# >a0b5d956-a940-427d-b5ff-f3a22e750389|2015-09-07|sierra_leone
# NNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTAGGATCTTTTGTGTGCGAATAACTAT...
# pipe to stdout
zoo dump --query q.json --selection _id - > qdump.json
zoo dump --query q.json --selection _id | wc -l
zoo dump --query q.json --selection _id,seq --fmt fasta - | grep ">" | head
# minhash
zoo dump --query q.json --selection _id,seq --fmt fasta - | \
sourmash compute -k 16 -n 100 --singleton --out q.sig -
```
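`--selection` picks fields from each document and `--delim` joins them into the FASTA header, as in the dump above. A toy sketch of that transformation (hypothetical; not zoo's actual code):

```python
import json

def to_fasta(line, fields=('_id', 'seq'), delim='|'):
    """Turn one ND-JSON dump line into a FASTA record."""
    d = json.loads(line)
    header = delim.join(str(d[f]) for f in fields)
    return '>{}\n{}'.format(header, d['seq'])
```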