Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support CIGARs longer than 65535 operations #263

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions bench/cljam/io/sam_bench.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
(ns cljam.io.sam-bench
(:require [libra.bench :refer [defbench are]]
[libra.criterium :as c]
[clojure.java.io :as cio]
[cljam.test-common :as tcommon]
[cljam.util :as util]
[cljam.io.sam :as sam]))

(defbench encode-alignment-short-bench
(are [f]
(util/with-temp-dir [d "encode-alignment-short-bench"]
(with-open [r (sam/reader f)
w (sam/writer (cio/file d "out.bam"))]
(let [header (sam/read-header r)
xs (vec (sam/read-alignments r))]
(sam/write-header w header)
(sam/write-refs w header)
(doseq [x xs o (:options x)] o)
(c/quick-bench
(sam/write-alignments w xs header)))))
tcommon/test-sam-file
tcommon/medium-sam-file))

(defbench encode-alignment-long-bench
(tcommon/prepare-cavia!)
(are [f]
(util/with-temp-dir [d "encode-alignment-long-bench"]
(with-open [r (sam/reader f)
w (sam/writer (cio/file d "out.bam"))]
(let [header (sam/read-header r)
xs (vec (take 2000000 (sam/read-alignments r)))]
(sam/write-header w header)
(sam/write-refs w header)
(doseq [x xs o (:options x)] o)
(c/quick-bench
(sam/write-alignments w xs header)))))
tcommon/large-bam-file))

(defbench decode-bam-alignment-short-bench
(are [f decode-opts?]
(c/quick-bench
(with-open [r (sam/reader f)]
(doseq [a (sam/read-alignments r)
opts (when decode-opts? (:options a))]
opts)))
tcommon/test-bam-file false
tcommon/test-bam-file true
tcommon/medium-bam-file false
tcommon/medium-bam-file true))

(defbench decode-bam-alignment-long-bench
(tcommon/prepare-cavia!)
(are [f]
(c/quick-bench
(with-open [r (sam/reader f)]
(doseq [a (vec (take 10000000 (sam/read-alignments r)))
opts (:options a)]
opts)))
tcommon/large-bam-file))
26 changes: 22 additions & 4 deletions src/cljam/io/bam/decoder.clj
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,14 @@

(defrecord BAMRawBlock [data ^long pointer-beg ^long pointer-end])

(defn- B-I-type-cigar-str->cigar-str
[B-I-type-cigar]
(let [xs (cstr/split (subs B-I-type-cigar 2) #",")
bb (doto (ByteBuffer/allocate (* 4 (count xs)))
(.order ByteOrder/LITTLE_ENDIAN))]
(doseq [x xs] (.putInt bb (Long/parseLong x)))
(first (cigar/decode-cigar-and-ref-length (.array bb)))))

(defn decode-alignment
"Decodes BAM block and creates SAMAlignment instance which is compatible with SAM.
When called with start and end, this function may return nil if any base of the block
Expand Down Expand Up @@ -129,9 +137,14 @@
seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq)
qual (decode-qual (lsb/read-bytes buffer l-seq))
rest (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq))
options (decode-options rest)]
options (decode-options rest)
[cigar* options*]
(if-let [cg (and (cigar/placeholder? cigar-bytes)
(:value (some :CG options)))]
[(B-I-type-cigar-str->cigar-str cg) (remove :CG options)]
[cigar options])]
(SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq)
cigar rnext (int pnext) (int tlen) seq qual options)))
cigar* rnext (int pnext) (int tlen) seq qual options*)))
([refs block ^long start ^long end]
(let [buffer (ByteBuffer/wrap (:data block))
ref-id ^int (lsb/read-int buffer)
Expand All @@ -157,9 +170,14 @@
qual (decode-qual (lsb/read-bytes buffer l-seq))
rest (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq))
rname (or (refs/ref-name refs ref-id) "*")
options (decode-options rest)]
options (decode-options rest)
[cigar* options*]
(if-let [cg (and (cigar/placeholder? cigar-bytes)
(:value (some :CG options)))]
[(B-I-type-cigar-str->cigar-str cg) (remove :CG options)]
[cigar options])]
(SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq)
cigar rnext (int pnext) (int tlen) seq qual options))))))))
cigar* rnext (int pnext) (int tlen) seq qual options*))))))))

(defn decode-region-block
"Decodes BAM block and returns a SAMRegionBlock instance containing covering range of the alignment."
Expand Down
24 changes: 17 additions & 7 deletions src/cljam/io/bam/encoder.clj
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,20 @@
read-length
^long (get-options-size aln))))

(defn- add-cigar-to-options
[options cigar]
(cons
{:CG {:type "B",:value (str "I," (cstr/join "," (cigar/encode-cigar cigar)))}}
options))

(defn encode-alignment [wrtr aln refs]
(let [aln (update aln :seq #(if (= % "*") "" %))]
(let [aln (update aln :seq #(if (= % "*") "" %))
cigar-ops-count (cigar/count-op (:cigar aln))
[encoded-cigar cigar-ops-count opts*]
(if (> cigar-ops-count 65535)
[(cigar/->placeholder (:cigar aln))
2 (add-cigar-to-options (:options aln) (:cigar aln))]
[(cigar/encode-cigar (:cigar aln)) cigar-ops-count (:options aln)])]
;; refID
(lsb/write-int wrtr (or (refs/ref-id refs (:rname aln)) -1))
;; pos
Expand All @@ -104,7 +116,7 @@
(lsb/write-ubyte wrtr (short (:mapq aln)))
(lsb/write-ushort wrtr (sam-util/compute-bin aln))
;; flag_nc
(lsb/write-ushort wrtr (cigar/count-op (:cigar aln)))
(lsb/write-ushort wrtr cigar-ops-count)
(lsb/write-ushort wrtr (:flag aln))
;; l_seq
(lsb/write-int wrtr (.length ^String (:seq aln)))
Expand All @@ -118,16 +130,14 @@
(lsb/write-string wrtr (:qname aln))
(lsb/write-bytes wrtr (byte-array 1 (byte 0)))
;; cigar
(doseq [cigar (cigar/encode-cigar (:cigar aln))]
(lsb/write-int wrtr cigar))
(doseq [cigar encoded-cigar] (lsb/write-int wrtr cigar))
;; seq
(lsb/write-bytes wrtr (seq/str->compressed-bases (:seq aln)))
;; qual
(lsb/write-bytes wrtr (encode-qual aln))

;; options
(doseq [op (:options aln)]
(let [[tag value] (first (seq op))]
(doseq [opt opts*]
(let [[tag value] (first (seq opt))]
(lsb/write-short
wrtr
(short (bit-or (bit-shift-left (byte (second (name tag))) 8)
Expand Down
24 changes: 24 additions & 0 deletions src/cljam/io/sam/util/cigar.clj
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@
(recur (+ ref-length (case op 0 n 2 n 3 n 7 n 8 n 0))))
[(.toString sb) ref-length]))))

(defn placeholder?
athos marked this conversation as resolved.
Show resolved Hide resolved
"Returns a boolean indicating whether a CIGAR is in `kSmN` format."
[^bytes cigar-bytes]
(and (= 8 (alength cigar-bytes))
(= 4 (bit-and 0xF (aget cigar-bytes 0))) ;; S
(= 3 (bit-and 0xF (aget cigar-bytes 4))))) ;; N

(defn encode-cigar
"Encodes CIGAR string into a sequence of longs."
[cigar]
Expand All @@ -116,3 +123,20 @@
(defmethod count-ref (Class/forName "[B")
[b]
(count-ref-bytes b))

(defn ->placeholder
"Creates an encoded placeholder from a given CIGAR string.
The placeholder is in the format of kSmN where k is the read length and m is
the reference length. Returns a vector of ints."
[cigar-str]
(transduce
identity
(fn
([[^long r ^long q]]
[(bit-or (bit-shift-left q 4) 4)
(bit-or (bit-shift-left r 4) 3)])
([[^long r ^long q] [n op]]
[(+ r (case op (\M \D \N \= \X) (long n) 0))
(+ q (case op (\M \I \S \= \X) (long n) 0))]))
[0 0]
(parse cigar-str)))
Binary file added test-resources/bam/long_cigar_operations.bam
Binary file not shown.
3 changes: 3 additions & 0 deletions test-resources/sam/long_cigar_operations.sam

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions test/cljam/io/bam/decoder_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
(ns cljam.io.bam.decoder-test
"Tests for cljam.io.bam.decoder."
(:require [clojure.test :refer [deftest is are testing]]
[cljam.io.protocols]
[cljam.io.bam.decoder :as decoder]
[cljam.test-common :as test-common]))

(deftest B-I-type-cigar-str->cigar-str-test
(is (= "1M2I4D8N16S32H64P128=65535X"
(#'decoder/B-I-type-cigar-str->cigar-str
"I,16,33,66,131,260,517,1030,2055,1048568"))))

(deftest decode-alignment-test
(let [rid-bin [0 0 0 0 1 0 0 0 2 0 73 18]
flag-qname [0 0 2 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 0 0 81 0]
seq-qual [17 -1 -1]
no-opts (concat rid-bin
[1 0] ;; n_cigar_op: 1
flag-qname
[32 0 0 0] ;; cigar: 2M
seq-qual)
with-CG (concat rid-bin
[2 0] ;; n_cigar_op: 2
flag-qname
[36 0 0 0 ;; cigar: 2S
35 0 0 0] ;; 2N
seq-qual
[67 71 66 73 ;; tag: CG, val_type: B-I
1 0 0 0 ;; count
32 0 0 0]) ;; cigar: 2M
expected (test-common/to-sam-alignment
{:qname "Q", :flag (int 0),
:rname "ref", :pos (int 2), :end (int 3),
:mapq (int 0), :cigar "2M",
:rnext "*", :pnext (int 0), :tlen (int 0),
:seq "AA", :qual "*", :options []})]

(testing "two-arguments-decode-alignment-test"
(are [?expected-aln ?byte]
(= ?expected-aln
(decoder/decode-alignment
[{:name "ref", :len 0}]
{:data (byte-array (map unchecked-byte ?byte))}))
expected
no-opts

expected
with-CG))

(testing "four-arguments-decode-alignment-test"
alumi marked this conversation as resolved.
Show resolved Hide resolved
(are [expected-aln byte start end]
(= expected-aln
(decoder/decode-alignment
[{:name "ref", :len 0}]
{:data (byte-array (map unchecked-byte byte))} start end))
expected
no-opts ;; pos 2, ref-end 3
2 3

nil
no-opts
1 1

nil
no-opts
4 100

expected
with-CG ;; pos 2, ref-end 3
2 2

nil
with-CG
1 1

nil
with-CG
4 (Math/pow 10 15)))))
108 changes: 106 additions & 2 deletions test/cljam/io/bam/encoder_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@
"Tests for cljam.io.bam.encoder."
(:require [clojure.test :refer [deftest are testing]]
[clojure.string :as cstr]
[cljam.io.bam.encoder :as encoder])
(:import [java.nio ByteBuffer]))
[cljam.io.bam.encoder :as encoder]
[cljam.test-common :as test-common])
(:import [java.io ByteArrayOutputStream DataOutputStream]
[java.nio ByteBuffer ByteOrder]))

(defn- get-encoded-option-data [?type ?values]
(let [bb (ByteBuffer/allocate 200)]
Expand Down Expand Up @@ -112,3 +114,105 @@
0x00 0x00 0x00 0x00
0x00 0x00 0xb8 0x40
0xff 0xff 0x7f 0x7f])))

(deftest add-cigar-to-options-test
(are [?cigar ?sample-option-list ?expected-option-list]
(= ?expected-option-list (#'encoder/add-cigar-to-options ?sample-option-list ?cigar))
"45S34N"
'({:Xa {:type "A", :value \p}}
{:XI {:type "B", :value "I,0,2147483647,4294967295"}})
'({:CG {:type "B", :value "I,724,547"}}
{:Xa {:type "A", :value \p}}
{:XI {:type "B", :value "I,0,2147483647,4294967295"}})

"1M2I4D8N16S32H64P128=65535X"
'()
'({:CG {:type "B", :value "I,16,33,66,131,260,517,1030,2055,1048568"}})))

(deftest encode-alignment-test
(let [cigar-consisting-of-65535-operations (apply str "1S" (repeat 32767 "1I1M"))
seq-for-65535-operations-cigar (apply str (repeat 65535 "A"))
aln-65535-cigar-operations-byte
(vec (concat
[0 0 0 0 ;; refID
0 0 0 0 ;; pos
5 0 ;; l_read_name, mapq
73 2 ;; bin
-1 -1 ;; n_cigar_ops
0 0 ;; flag
-1 -1 0 0 ;; l_seq
-1 -1 -1 -1 ;; next_refID
-1 -1 -1 -1 ;; next_pos
0 0 0 0 ;; tlen
82 48 48 49 0] ;; read_name
;; cigar
[20 0 0 0] ;; 1S
(take (* 8 32767) (cycle [17 0 0 0 16 0 0 0])) ;; repeat 1I1M
(repeat 32767 17) '[16] ;; seq
(repeat 65535 -1))) ;; qual
cigar-consisting-of-65536-operations (apply str (repeat 32768 "1I1M"))
seq-for-65536-operations-cigar (apply str (repeat 65536 "A"))
aln-65536-cigar-operations-byte
(vec (concat
[0 0 0 0 ;; refID
0 0 0 0 ;; pos
5 0 ;; l_read_name, mapq
73 2 ;; bin
2 0 ;; n_cigar_op
0 0 ;; flag
0 0 1 0 ;; l_seq
-1 -1 -1 -1 ;; next_refID
-1 -1 -1 -1 ;; next_pos
0 0 0 0 ;; tlen
82 48 48 49 0 ;; read_name
4 0 16 0 3 0 8 0] ;; cigar replaced by placeholder "65536S32768N"
(repeat 32768 17) ;; seq
(repeat 65536 -1) ;; qual
;; options
[67 71 66 73] ;; tag: CG, val_type: B-I
[0 0 1 0] ;; count
(take (* 8 32768) (cycle [17 0 0 0 16 0 0 0])) ;; real cigar: repeat 1I1M
))]

(are [?aln ?expected-byte]
(= (doto (ByteBuffer/wrap (byte-array (map unchecked-byte ?expected-byte)))
(.order ByteOrder/LITTLE_ENDIAN))
(with-open [baos (ByteArrayOutputStream. 4096)
dos (DataOutputStream. baos)]
(encoder/encode-alignment dos ?aln '({:name "ref", :len 0}))
(ByteBuffer/wrap (byte-array (.toByteArray baos)))))

(test-common/to-sam-alignment
{:qname "r003", :flag 16, :rname "ref", :pos 29, :end 33, :mapq 30,
:cigar "6H5M", :rnext "*", :pnext 0, :tlen 0, :seq "TAGGC", :qual "*",
:options ()})
[0 0 0 0 ;; refID
28 0 0 0 ;; pos
5 30 ;; l_read_name, mapq
73 18 ;; bin
2 0 ;; n_cigar_op
16 0 ;; flag
5 0 0 0 ;; l_seq
-1 -1 -1 -1 ;; next_refID
-1 -1 -1 -1 ;; next_pos
0 0 0 0 ;; tlen
114 48 48 51 0 ;; read_name
101 0 0 0 80 0 0 0 ;; cigar
-127 68 32 ;; seq
-1 -1 -1 -1 -1] ;; qual

(test-common/to-sam-alignment
{:qname "R001" :flag (int 0) :rname "ref" :pos (int 1) :end (int 32767)
:seq seq-for-65535-operations-cigar :qual "*"
:cigar cigar-consisting-of-65535-operations
:rnext "*" :pnext (int 0) :tlen (int 0) :mapq (int 0)
:options ()})
aln-65535-cigar-operations-byte

(test-common/to-sam-alignment
{:qname "R001" :flag (int 0) :rname "ref" :pos (int 1) :end (int 32768)
:seq seq-for-65536-operations-cigar :qual "*"
:cigar cigar-consisting-of-65536-operations
:rnext "*" :pnext (int 0) :tlen (int 0) :mapq (int 0)
:options ()})
aln-65536-cigar-operations-byte)))
Loading