Skip to content

Command line interface

Adrian Viehweger edited this page Apr 10, 2017 · 18 revisions

Figure: CLI use case.

Note: The following code is for reference only, and not intended as a structured tutorial.

# at place A, create some json
zoo init file.json  # validation checks on records at this stage
# more modifications
zoo commit -m 'new RNA virus assemblies from study'
dat share . # generates link asdew3es...

# in some faraway place B
dat clone asdew3es...
zoo add --db zika --cell new_study 
# rename
zoo drop --db zika --cell new_study --force
zoo add --db zika --cell renamed_study
# now we can do analyses, e.g. MSA against some flaviviruses
zoo commit -m 'new RNA viruses related to flavivirus'

# at some place C, somebody already cloned zoo B
zoo pull # new sequences now present
zoo status --db zika

# This is lucky, because B got frustrated and deleted everything.
zoo destroy --db zika

Example:

zoo init --db zika --cell a zoo/data/cell_a.json
# Initializing data cell.
# Inserted 3 entries into cell "a".

zoo add --db zika --cell a --primkey genbank.a zoo/data/cell_b.json
# Loading data cell.
# Index created on field "genbank.a".
# 1 documents inserted in cell "a".
# 3 duplicates skipped.

zoo add --db zika --cell a cell_c.json
# Loading data cell.
# 2 documents inserted in cell "a".

zoo init --db zika --cell c zoo/data/cell_c_change.json
zoo commit --db zika --cell c --ksize 3,4,5 --n 5 cell_c_change_commit

# now pull these changes to cell "a"


zoo drop --db zika --cell a --force
# Dropped cell "a" from database "zika".

pull

zoo init --db virus --cell original virus.json
# Initializing data cell.
# 3 entries inserted into cell "original".
# Primary key assigned to field "_id".
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 original

mkdir send
cp original.json send/
dat share send/
# Syncing Dat Archive: /Users/pi/tmp/send
# Link: dat://73401e1b931164763ecc5a04fad78e4788682677cefc718ebf49f6b4fe4dbad7

mkdir receive
dat clone <link> receive/
# Download Finished!
# Total size: 1 file (484 B)
ls receive
# original.json

from pymongo import MongoClient

# Open the "original" cell inside the "virus" database.
cell = MongoClient('localhost:27017')['virus']['original']

# Inspect the current contents of the cell.
list(cell.find())

# we did experiments to replace "N" in bunyavirus seq w/ nucleotides
cell.update_one({'virus': 'bunya'}, {'$set': {'sequence': 'ACTACCTTATA'}})
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 original

# before
cat original.json
{"_id": "89d96f57-63d1-4efc-9c5b-13af6473eaad", "alt_id": {"gb": "gb1"}, "md5": "e7b1f3d8199b4b7fd5d54af4a1afac37", "sequence": "ACTAACCTATA", "virus": "flavi"}
{"_id": "ecd51cba-ce85-4c48-b63d-40a29a1b6676", "alt_id": {"gb": "gb1"}, "md5": "87bb94d7795874f84ef7731a823be434", "sequence": "TTTAACCTATA", "virus": "corona"}
{"_id": "780ca018-6267-440d-86e0-56fe0c211d70", "alt_id": {"gb": "gb1"}, "md5": "4c720baa79bfaf58597660b9720cd5d8", "sequence": "ACTANNNNATA", "virus": "bunya"}

zoo commit --db virus --cell original --ksize 3,4,5 --n 5 original
# Dumping data cell.
# | 3 Elapsed Time: 0:00:00
# Done.

# after
cat original.json
{"_id": "89d96f57-63d1-4efc-9c5b-13af6473eaad", "alt_id": {"gb": "gb1"}, "md5": "e7b1f3d8199b4b7fd5d54af4a1afac37", "sequence": "ACTAACCTATA", "virus": "flavi"}
{"_id": "ecd51cba-ce85-4c48-b63d-40a29a1b6676", "alt_id": {"gb": "gb1"}, "md5": "87bb94d7795874f84ef7731a823be434", "sequence": "TTTAACCTATA", "virus": "corona"}
{"_id": "780ca018-6267-440d-86e0-56fe0c211d70", "alt_id": {"gb": "gb1"}, "md5": "e5a49f574d58bfc3d27fe2c93285a199", "sequence": "ACTACCTTATA", "virus": "bunya"}

zoo drop --db virus --cell original --force

dat is still buggy; try without it and "simulate changed files" instead.

zoo drop --db virus --cell original --force
zoo init --db virus --cell original virus.json
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 original
from pymongo import MongoClient

# Grab the "original" cell of the "virus" database on the local MongoDB.
collection = MongoClient('localhost:27017')['virus']['original']

# Dump all documents to check what is currently stored.
[doc for doc in collection.find()]

# we did experiments to replace "N" in bunyavirus seq w/ nucleotides
collection.update_one(
    {'virus': 'bunya'}, {'$set': {'sequence': 'ACTACCTTATA'}})
zoo commit --db virus --cell original --ksize 3,4,5 --n 5 modified
zoo add --db virus --cell modified original.json
zoo pull --db virus --cell modified modified.json
# Updating cell's md5 hashes.
# / 0 Elapsed Time: 0:00:00
# 
# 2 entries unchanged.
# 1 entries replaced.
tail -n1 original.json
# {"_id": "c133bf2d-04b1-4c8e-910a-c41e5376bae5", "alt_id": {"gb": "gb1"}, "md5": "4c720baa79bfaf58597660b9720cd5d8", "sequence": "ACTANNNNATA", "virus": "bunya"}

tail -n1 modified.json
# {"_id": "c133bf2d-04b1-4c8e-910a-c41e5376bae5", "alt_id": {"gb": "gb1"}, "md5": "e5a49f574d58bfc3d27fe2c93285a199", "sequence": "ACTACCTTATA", "virus": "bunya"}

diff

zoo add --db diff --cell mock tests/cell_a.json
zoo diff --db diff --cell mock --out diff.json tests/cell_b.json
cat diff.json

minhash, SBT

$ zoo sbt_index --db ref --cell ref --ksize 16 --nsketch 1000 \
reference
Initialize SBT.
Compute minhash signatures for selected documents.
k-mer size: 16, sketch size: 1000
\ 9158 Elapsed Time: 0:01:45
Save SBT.
Done.
$ sourmash sbt_search --ksize 16 reference survey.fa.sig
# running sourmash subcommand: sbt_search
loaded query: survey.fa... (k=16, DNA)
0.11 0ef85591-d464-4953-915f-f673907b7e8e (Zika reference genome)

status

zoo status --db diff --cell mock --example

dat

```shell
dat cp data/tests/cell_a.json tmp/dump.json
dat share send/
dat clone <link> receive/
```


import json
import random


# Load the newline-delimited JSON dump into memory, one dict per line.
records = []
with open('tmp/dump.json', 'r') as fh:  # read-only; 'r+' granted unused write access
    for line in fh:
        line = line.strip()
        if not line:
            # Tolerate blank/trailing empty lines in the dump.
            continue
        records.append(json.loads(line))

# modify: drop the "dangerous" field and tag each record with a random float
for record in records:
    # pop() with a default is best-effort: no KeyError if a record
    # lacks the "dangerous" field (del would crash on it).
    record.pop('dangerous', None)
    record['random'] = random.uniform(0, 1)


# dump again, as newline-delimited JSON, into the shared dat folder
with open('send/dump.json', 'w') as fh:  # write-only; 'w+' read access was unused
    for record in records:
        fh.write(json.dumps(record))
        fh.write('\n')


```shell
dat pull receive/
# changes updated
```

load

zoo load --source ncbi --fmt json --email '' \
--ids data/rna_virome_shi2016/rna_virome_shi2016.txt \
result.json

# streaming GenBank records in JSON format to cell
zoo load --source ncbi --fmt json --email '' \
--ids data/rna_virome_shi2016/rna_virome_shi2016.txt \
--stdout - | \
zoo init --db foo --cell bar -

dump

cat q.json
# {}
zoo dump --query q.json --selection _id,meta.date,meta.geo.cou,seq \
--delim "|" --fmt fasta dump.fa
head -n2 dump.fa
# >a0b5d956-a940-427d-b5ff-f3a22e750389|2015-09-07|sierra_leone
# NNNNNNNNNNNNNNNNNNNNNNNNNNNNNTTTAGGATCTTTTGTGTGCGAATAACTAT...

# pipe to stdout
zoo dump --query q.json --selection _id - > qdump.json
zoo dump --query q.json --selection _id | wc -l
zoo dump --query q.json --selection _id,seq --fmt fasta - | grep ">" | head

# minhash
zoo dump --query q.json --selection _id,seq --fmt fasta - | \
sourmash compute -k 16 -n 100 --singleton --out q.sig -
Clone this wiki locally