Skip to content

Commit

Permalink
Support CIGARs longer than 65535 operations
Browse files Browse the repository at this point in the history
  • Loading branch information
matsutomo81 committed Sep 27, 2022
1 parent 33e5f1b commit b62bd28
Show file tree
Hide file tree
Showing 8 changed files with 347 additions and 14 deletions.
45 changes: 41 additions & 4 deletions src/cljam/io/bam/decoder.clj
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,33 @@

(defrecord BAMRawBlock [data ^long pointer-beg ^long pointer-end])

(defn- first-cigar-operation-clips-entire-read?
[cigar l-seq]
(let [first-operation (re-find #"\d+[MIDNSHP=X]" cigar)]
(if (and (= \S (last first-operation))
(= l-seq (Integer/parseInt
(subs first-operation 0 (dec (count first-operation))))))
true
false)))

(defn- B-I-type-cigar-str->cigar-str
[B-I-type-cigar]
(apply str
(map #(str (first %)
(case (second %)
"0" \M "1" \I "2" \D "3" \N "4" \S "5" \H "6" \P "7" \= "8" \X))
(partition 2 (rest (cstr/split B-I-type-cigar #","))))))

(defn- get-cigar-from-options
[options]
(->> options
(keep #(get % :CG))
first
(#(B-I-type-cigar-str->cigar-str (get % :value)))))

(defn- remove-cigar-from-options [options]
(filter #(not (contains? % :CG)) options))

(defn decode-alignment
"Decodes BAM block and creates SAMAlignment instance which is compatible with SAM.
When called with start and end, this function may return nil if any base of the block
Expand Down Expand Up @@ -129,9 +156,14 @@
seq (decode-seq (lsb/read-bytes buffer (quot (inc l-seq) 2)) l-seq)
qual (decode-qual (lsb/read-bytes buffer l-seq))
rest (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq))
options (decode-options rest)]
options (decode-options rest)
[cigar* options*]
(if (and (contains? (first options) :CG)
(first-cigar-operation-clips-entire-read? cigar l-seq))
[(get-cigar-from-options options) (remove-cigar-from-options options)]
[cigar options])]
(SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq)
cigar rnext (int pnext) (int tlen) seq qual options)))
cigar* rnext (int pnext) (int tlen) seq qual options*)))
([refs block ^long start ^long end]
(let [buffer (ByteBuffer/wrap (:data block))
ref-id ^int (lsb/read-int buffer)
Expand All @@ -157,9 +189,14 @@
qual (decode-qual (lsb/read-bytes buffer l-seq))
rest (lsb/read-bytes buffer (options-size (alength ^bytes (:data block)) l-read-name n-cigar-op l-seq))
rname (or (refs/ref-name refs ref-id) "*")
options (decode-options rest)]
options (decode-options rest)
[cigar* options*]
(if (and (contains? (first options) :CG)
(first-cigar-operation-clips-entire-read? cigar l-seq))
[(get-cigar-from-options options) (remove-cigar-from-options options)]
[cigar options])]
(SAMAlignment. qname (int flag) rname (int pos) ref-end (int mapq)
cigar rnext (int pnext) (int tlen) seq qual options))))))))
cigar* rnext (int pnext) (int tlen) seq qual options*))))))))

(defn decode-region-block
"Decodes BAM block and returns a SAMRegionBlock instance containing covering range of the alignment."
Expand Down
42 changes: 37 additions & 5 deletions src/cljam/io/bam/encoder.clj
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,41 @@
read-length
^long (get-options-size aln))))

(defn- long-cigar->placeholder* [cigar regex]
(->> cigar
(re-seq regex)
(map #(Integer/parseInt (re-find #"\d+" %)))
(apply +)
(#(str %))))

(defn- long-cigar->placeholder
[cigar]
(str
(long-cigar->placeholder* cigar #"\d+[MIS=X]") "S"
(long-cigar->placeholder* cigar #"\d+[MDN=X]") "N"))

(defn- str-cigar->B-I-type
[cigar]
(->> cigar
(cigar/parse)
(map #(str (first %) ","
(case (second %)
\M 0 \I 1 \D 2 \N 3 \S 4 \H 5 \P 6 \= 7 \X 8)))
(cstr/join ",")
(str "I,")))

(defn- add-cigar-to-options
[options cigar]
(cons {:CG {:type "B", :value (str-cigar->B-I-type cigar)}} options))

(defn encode-alignment [wrtr aln refs]
(let [aln (update aln :seq #(if (= % "*") "" %))]
(let [aln (update aln :seq #(if (= % "*") "" %))
cigar-count-op (cigar/count-op (:cigar aln))
[cigar op* cigar-count-op*]
(if (> 65536 cigar-count-op)
[(:cigar aln) (:options aln) cigar-count-op]
[(long-cigar->placeholder
(:cigar aln)) (add-cigar-to-options (:options aln) (:cigar aln)) 2])]
;; refID
(lsb/write-int wrtr (or (refs/ref-id refs (:rname aln)) -1))
;; pos
Expand All @@ -104,7 +137,7 @@
(lsb/write-ubyte wrtr (short (:mapq aln)))
(lsb/write-ushort wrtr (sam-util/compute-bin aln))
;; flag_nc
(lsb/write-ushort wrtr (cigar/count-op (:cigar aln)))
(lsb/write-ushort wrtr cigar-count-op*)
(lsb/write-ushort wrtr (:flag aln))
;; l_seq
(lsb/write-int wrtr (.length ^String (:seq aln)))
Expand All @@ -118,15 +151,14 @@
(lsb/write-string wrtr (:qname aln))
(lsb/write-bytes wrtr (byte-array 1 (byte 0)))
;; cigar
(doseq [cigar (cigar/encode-cigar (:cigar aln))]
(doseq [cigar (cigar/encode-cigar cigar)]
(lsb/write-int wrtr cigar))
;; seq
(lsb/write-bytes wrtr (seq/str->compressed-bases (:seq aln)))
;; qual
(lsb/write-bytes wrtr (encode-qual aln))

;; options
(doseq [op (:options aln)]
(doseq [op op*]
(let [[tag value] (first (seq op))]
(lsb/write-short
wrtr
Expand Down
Binary file added test-resources/bam/long_cigar_operations.bam
Binary file not shown.
4 changes: 4 additions & 0 deletions test-resources/sam/long_cigar_operations.sam

Large diffs are not rendered by default.

153 changes: 153 additions & 0 deletions test/cljam/io/bam/decoder_test.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
(ns cljam.io.bam.decoder-test
"Tests for cljam.io.bam.decoder."
(:require [clojure.test :refer [deftest is are testing]]
[cljam.io.protocols]
[cljam.io.bam.decoder :as decoder]
[cljam.test-common :as test-common]))

((deftest first-cigar-operation-clips-entire-read?-test
(are [expected cigar l-seq]
(= expected
(#'decoder/first-cigar-operation-clips-entire-read? cigar l-seq))
true "34S" 34
true "1000000000S20N" 1000000000
false "10S20N" 11
false "10S" 11
false "10H20N" 10)))

(deftest B-I-type-cigar-str->cigar-str-test
(is (= "1M2I3D4N5S10H11P12=13X"
(#'decoder/B-I-type-cigar-str->cigar-str
"I,1,0,2,1,3,2,4,3,5,4,10,5,11,6,12,7,13,8"))))

(deftest get-cigar-from-options-test
(are [expected-cigar sample-option]
(= expected-cigar (#'decoder/get-cigar-from-options sample-option))
"12S34N"
'({:CG {:type "B", :value "I,12,4,34,3"}}
{:Xa {:type "A", :value \p}}
{:Xi {:type "i", :value -100}}
{:Xf {:type "f", :value -1400.0}}
{:Xz {:type "Z", :value "AATTGGCC"}}
{:Xh {:type "H", :value (map unchecked-byte [0x1A 0xE3 0x01])}}
{:Xc {:type "B", :value "c,-128,0,127"}}
{:XC {:type "B", :value "C,0,127,255"}}
{:Xs {:type "B", :value "s,-32768,0,32767"}}
{:XS {:type "B", :value "S,0,32767,65535"}}
{:Xi {:type "B", :value "i,-2147483648,0,2147483647"}}
{:XI {:type "B", :value "I,0,2147483647,4294967295"}}
{:Xf {:type "B", :value "f,-0.3,0.0,0.3"}})

"1M2I4D8N16S32H64P128=256X"
'({:MD {:type "Z" :value "2G0C0"}}
{:XT {:type "A", :value \R}}
{:NM {:type "i", :value 44}}
{:cg {:type "Z", :value "foo"}}
{:CG {:type "B", :value "I,1,0,2,1,4,2,8,3,16,4,32,5,64,6,128,7,256,8"}})))

(deftest remove-cigar-from-options-test
(are [expected-option sample-option]
(= expected-option (#'decoder/remove-cigar-from-options sample-option))
'({:Xa {:type "A", :value \p}}
{:Xi {:type "i", :value -100}}
{:Xf {:type "f", :value -1400.0}}
{:Xz {:type "Z", :value "AATTGGCC"}}
{:Xh {:type "H", :value (map unchecked-byte [0x1A 0xE3 0x01])}}
{:Xc {:type "B", :value "c,-128,0,127"}}
{:XC {:type "B", :value "C,0,127,255"}}
{:Xs {:type "B", :value "s,-32768,0,32767"}}
{:XS {:type "B", :value "S,0,32767,65535"}}
{:Xi {:type "B", :value "i,-2147483648,0,2147483647"}}
{:XI {:type "B", :value "I,0,2147483647,4294967295"}}
{:Xf {:type "B", :value "f,-0.3,0.0,0.3"}})
'({:CG {:type "B", :value "I,12,4,34,3"}}
{:Xa {:type "A", :value \p}}
{:Xi {:type "i", :value -100}}
{:Xf {:type "f", :value -1400.0}}
{:Xz {:type "Z", :value "AATTGGCC"}}
{:Xh {:type "H", :value (map unchecked-byte [0x1A 0xE3 0x01])}}
{:Xc {:type "B", :value "c,-128,0,127"}}
{:XC {:type "B", :value "C,0,127,255"}}
{:Xs {:type "B", :value "s,-32768,0,32767"}}
{:XS {:type "B", :value "S,0,32767,65535"}}
{:Xi {:type "B", :value "i,-2147483648,0,2147483647"}}
{:XI {:type "B", :value "I,0,2147483647,4294967295"}}
{:Xf {:type "B", :value "f,-0.3,0.0,0.3"}})

'({:MD {:type "Z" :value "2G0C0"}}
{:XT {:type "A", :value \R}}
{:NM {:type "i", :value 44}}
{:cg {:type "B", :value "I,3,1"}})
'({:MD {:type "Z" :value "2G0C0"}}
{:XT {:type "A", :value \R}}
{:NM {:type "i", :value 44}}
{:cg {:type "B", :value "I,3,1"}}
{:CG {:type "B", :value "I,34,0"}})

'()
'({:CG {:type "B", :value "I,34,7"}})))

(deftest two-arguments-decode-alignment-test
(let [encoded-sam-byte-without-CG-tag
[0 0 0 0 8 0 0 0 5 30 73 18 2 0 0 0 6 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0
0 0 114 48 48 51 0 85 0 0 0 96 0 0 0 20 40 17 -1 -1 -1 -1 -1 -1 88 88
66 83 4 0 0 0 17 49 2 0 20 0 112 0]
encoded-sam-byte-including-CG-tag
[0 0 0 0 6 0 0 0 5 30 73 18 2 0 -93 0 19 0 0 0 0 0 0 0 36 0 0 0 39 0 0
0 114 48 48 49 0 52 1 0 0 3 1 0 0 -120 20 24 17 20 20 65 -127 40 64
-1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 67 71 66 73
10 0 0 0 8 0 0 0 0 0 0 0 4 0 0 0 1 0 0 0 4 0 0 0 0 0 0 0 1 0 0 0 2
0 0 0 3 0 0 0 0 0 0 0]]
(testing "two-arguments-decode-alignment-test"
(are [expected-aln byte]
(= expected-aln
(decoder/decode-alignment
[{:name "ref", :len 0}]
{:data (byte-array (map unchecked-byte byte))}))
(test-common/to-sam-alignment
{:qname "r003" :flag 0, :rname "ref", :pos 9, :end 14, :mapq 30,
:cigar "5H6M", :rnext "*", :pnext 0, :tlen 0, :seq "AGCTAA",
:qual "*" :options [{:XX {:type "B", :value "S,12561,2,20,112"}}]})
encoded-sam-byte-without-CG-tag

(test-common/to-sam-alignment
{:qname "r001", :flag 163, :rname "ref", :pos 7, :end 22, :mapq 30,
:cigar "8M4I4M1D3M", :rnext "=", :pnext 37, :tlen 39,
:seq "TTAGATAAAGAGGATACTG", :qual "*", :options ()})
encoded-sam-byte-including-CG-tag))

(testing "four-arguments-decode-alignment-test"
(are [expected-aln byte start end]
(= expected-aln
(decoder/decode-alignment
[{:name "ref", :len 0}]
{:data (byte-array (map unchecked-byte byte))} start end))
(test-common/to-sam-alignment
{:qname "r003", :flag 0, :rname "ref", :pos 9, :end 14, :mapq 30,
:cigar "5H6M", :rnext "*", :pnext 0, :tlen 0, :seq "AGCTAA", :qual "*",
:options [{:XX {:type "B", :value "S,12561,2,20,112"}}]})
encoded-sam-byte-without-CG-tag
14 9

nil
encoded-sam-byte-without-CG-tag
14 8

nil
encoded-sam-byte-without-CG-tag
15 9

(test-common/to-sam-alignment
{:qname "r001", :flag 163, :rname "ref", :pos 7, :end 22, :mapq 30,
:cigar "8M4I4M1D3M", :rnext "=", :pnext 37, :tlen 39,
:seq "TTAGATAAAGAGGATACTG", :qual "*", :options ()})
encoded-sam-byte-including-CG-tag
22 7

nil
encoded-sam-byte-including-CG-tag
22 6

nil
encoded-sam-byte-including-CG-tag
23 7))))
Loading

0 comments on commit b62bd28

Please sign in to comment.