diff --git a/.gitignore b/.gitignore
index 4e851d01..771bf7b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,4 +30,5 @@ src/backends/qnn/qualcomm_ai_engine_direct_220/*
 src/backends/qnn/HexagonSDK/*
 examples/demo_deepseek.cpp
 examples/demo_xllm.cpp
-src/models/xllm/*
\ No newline at end of file
+src/models/xllm/*
+src/models/deepseek/*
\ No newline at end of file
diff --git a/src/Layer.hpp b/src/Layer.hpp
index 4625224f..04968752 100644
--- a/src/Layer.hpp
+++ b/src/Layer.hpp
@@ -544,7 +544,7 @@ class Matmul final : public Layer {
         return ts[0].get();
     }
 };
-
+/*
 class Split final : public Layer {
 public:
     Split() = default;
@@ -570,7 +570,7 @@ class Split final : public Layer {
         return run({input}, (int)param_["split_num"]);
     }
 };
-
+*/
 class Convolution2D final : public Layer {
 public:
     explicit Convolution2D(int in_channel, int out_channel, vector<int> kernal, vector<int> stride, PaddingType padding, bool bias, std::string name) {
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index e41e190b..efdd1c88 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -342,7 +342,8 @@ vector<Tensor> Tensor::split(Tensor &input, std::vector<int> each_dims, Chl split_dim, int head_size) {
     std::vector args;
     for (int i = 0; i < each_dims.size(); ++i) {
         args.push_back(each_dims[i]);
-        next_names.push_back(input.name() + "-split-" + std::to_string(i) + "-" + std::to_string(each_dims[i]));
+        // next_names.push_back(input.name() + "-split-" + std::to_string(i) + "-" + std::to_string(each_dims[i]));
+        next_names.push_back(input.name() + ".split-" + std::to_string(i));
     }
     args.push_back(split_dim);
     args.push_back(head_size);
diff --git a/src/models/deepseek/configuration_deepseek.hpp b/src/models/deepseek/configuration_deepseek.hpp
deleted file mode 100644
index c5a1d70a..00000000
--- a/src/models/deepseek/configuration_deepseek.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-
-#ifndef CONFIG_DEEPSEEK_HPP
-#define CONFIG_DEEPSEEK_HPP
-#include "Types.hpp"
-#include "models/transformer/configuration_transformer.hpp"
-
-using namespace mllm;
-
-class DeepseekNameConfig : public TransformerNameConfig {
-public:
-    /**
-     * @brief Deepseek following the hugging face naming method
-     *
-     * @param type RoPEType
-     */
-    void init() {
-        blk_name = "model.layers.";
-        _attn_base_name = "self_attn.";
-        _ffn_base_name = "mlp.";
-        _q_proj_name = "q_proj";
-        _kv_a_proj_with_mqa_name = "kv_a_proj_with_mqa";
-        _kv_a_layernorm_name = "kv_a_layernorm";
-        _kv_b_proj_name = "kv_b_proj";
-        _o_proj_name = "o_proj";
-        _gate_proj_name = "gate_proj";
-        _up_proj_name = "up_proj";
-        _down_proj_name = "down_proj";
-        _attn_norm_name = "input_layernorm";
-        _ffn_norm_name = "post_attention_layernorm";
-        token_embd_name = "model.embed_tokens";
-        post_norm_name = "model.norm";
-        lm_head_name = "model.embed_tokens";
-    }
-
-    std::string _kv_a_proj_with_mqa_name;
-    std::string _kv_a_layernorm_name;
-    std::string _kv_b_proj_name;
-
-    std::string blk_name;
-    std::string token_embd_name;
-    std::string post_norm_name;
-    std::string lm_head_name;
-    std::string _gate_proj_name;
-};
-
-struct DeepseekConfig {
-    explicit DeepseekConfig(int token_limit):
-        cache_limit(token_limit) {
-        names_config.init();
-    };
-
-    int vocab_size = 32000;
-    int max_position_embeddings = 8192;
-    int num_hidden_layers = 12;
-    int hidden_size = 768;
-    int intermediate_size = 2048;
-    int num_heads = 16;
-    int qk_rope_head_dim=24;//qk_rope_head_dim
-    int qk_nope_head_dim=48; //qk_nope_head_dim = qk_rope_head_dim*2
-    int v_head_dim=48; //v_head_dim= qk_nope_head_dim*2
-    int kv_lora_rank = 192; //kv_lora_rank = 2568* qk_nope_head_dim;
-
-    // int vocab_size = 152064;
-    // int max_position_embeddings = 8192;
-    // int num_hidden_layers = 12;
-    // int hidden_size = 1024;
-    // int intermediate_size = 4864;
-    // int num_heads = 16;
-    // int qk_rope_head_dim=32;//qk_rope_head_dim
-    // int qk_nope_head_dim=64; //qk_nope_head_dim = qk_rope_head_dim*2
-    // int v_head_dim=64; //v_head_dim= qk_nope_head_dim*2
-    // int kv_lora_rank = 256; //kv_lora_rank = 2568* qk_nope_head_dim;
-
-
-    float rms_norm_eps = 1e-6;
-    int cache_limit;
-    bool do_mask=true;
-
-
-    DeepseekNameConfig names_config;
-};
-
-#endif //! CONFIG_DEEPSEEK_HPP
diff --git a/src/models/deepseek/modeling_deepseek.hpp b/src/models/deepseek/modeling_deepseek.hpp
deleted file mode 100644
index 3fbece36..00000000
--- a/src/models/deepseek/modeling_deepseek.hpp
+++ /dev/null
@@ -1,203 +0,0 @@
-//
-// Created by Rongjie Yi on 24-6-20.
-//
-
-#ifndef MODELING_DEEPSEEK_HPP
-#define MODELING_DEEPSEEK_HPP
-
-#include "configuration_deepseek.hpp"
-
-using namespace mllm;
-
-class DeepseekMultiHeadLatentAttention final : public Module {
-    Layer q_proj;
-    Layer kv_a_proj_with_mqa;
-    Layer kv_a_layernorm;
-    Layer kv_b_proj;
-    Layer k_proj;
-    Layer v_proj;
-    Layer q_rope;
-    Layer k_rope;
-    KVCache k_cache;
-    KVCache v_cache;
-    Softmax softmax;
-    Layer o_proj;
-    int num_heads{};
-    int q_head_dim{};
-    int v_head_dim{};
-    int qk_nope_head_dim{};
-    int qk_rope_head_dim{};
-    int kv_lora_rank{};
-    float softmax_scale{};
-public:
-    DeepseekMultiHeadLatentAttention() = default;
-    DeepseekMultiHeadLatentAttention(const DeepseekConfig config, const DeepseekNameConfig &names, const string &base_name) {
-        num_heads = config.num_heads;
-        qk_nope_head_dim =config.qk_nope_head_dim;
-        qk_rope_head_dim =config.qk_rope_head_dim;
-        kv_lora_rank = config.kv_lora_rank;
-        v_head_dim = config.v_head_dim;
-        q_head_dim=config.qk_nope_head_dim + config.qk_rope_head_dim;
-        q_proj = Linear(
-            config.hidden_size,
-            num_heads * q_head_dim,
-            false,
-            base_name + names._q_proj_name);
-        kv_a_proj_with_mqa = Linear(
-            config.hidden_size,
-            kv_lora_rank + qk_rope_head_dim,
-            false,
-            base_name + names._kv_a_proj_with_mqa_name
-        );
-        kv_a_layernorm = RMSNorm(kv_lora_rank, config.rms_norm_eps, base_name + names._kv_a_layernorm_name);
-        kv_b_proj = Linear(
-            kv_lora_rank,
-            num_heads * (q_head_dim - qk_rope_head_dim + v_head_dim),
-            false,
-            base_name + names._kv_b_proj_name
-        );
-        o_proj = Linear(
-            num_heads * v_head_dim,
-            config.hidden_size,
-            false,
-            base_name + names._o_proj_name
-        );
-        q_rope = RoPE(RoPEType::MLAROPE, base_name + "q_rope");
-        k_rope = RoPE(RoPEType::MLAROPE, base_name + "k_rope");
-        if (config.cache_limit > 0) {
-            k_cache = KVCache(num_heads/num_heads, config.cache_limit, base_name + "k_cache");
-            v_cache = KVCache(num_heads/num_heads, config.cache_limit, base_name + "v_cache");
-        }
-        softmax = Softmax(DIMENSION, config.do_mask, base_name + "softmax");
-        softmax_scale = 1/std::sqrt(q_head_dim);
-    }
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
-        auto hidden_states = inputs[0];
-
-        auto q = q_proj(hidden_states);
-        auto qs = Tensor::split(q, {qk_nope_head_dim, qk_rope_head_dim}, D_HD, num_heads);
-        q = Tensor::cat({qs[0], q_rope(qs[1])}, DIMENSION);
-
-        Tensor compressed_kv = kv_a_proj_with_mqa(hidden_states);
-        auto kvs = Tensor::split(compressed_kv,
-                                 {kv_lora_rank, qk_rope_head_dim}, DIMENSION);
-        auto k_pe = k_rope(kvs[1]);
-        auto kv = kv_b_proj(kv_a_layernorm(kvs[0]));//.view(-1, head_size_, -1, qk_nope_head_dim_ + v_head_dim_);
-        kvs = Tensor::split(kv, {qk_nope_head_dim, v_head_dim}, D_HD, num_heads);
-        auto v = kvs[1];
-        auto k = Tensor::cat({kvs[0], k_pe}, DIMENSION);
-        if (k_cache.ready() && v_cache.ready()) {
-            k = k_cache(k);
-            v = v_cache(v);
-        }
-        k = k.transpose(SEQUENCE, DIMENSION);
-        auto qk = Tensor::mm(q, k);
-        qk = qk * softmax_scale;
-        qk = softmax(qk, k_cache.getCacheSeqLen());
-        auto o = Tensor::mm(qk, v);
-        o = o.view(-1, 1, -1, v_head_dim * num_heads);
-        o = o_proj(o);
-        return {o};
-    }
-};
-
-class DeepseekMLP final : public Module {
-private:
-    Layer gate_proj;
-    Layer up_proj;
-    Layer down_proj;
-    Layer gelu;
-public:
-    DeepseekMLP() = default;
-    DeepseekMLP(const DeepseekConfig &config, const DeepseekNameConfig &names, const std::string &base_name) {
-        int hidden_size = config.hidden_size;
-        int intermediate_size = config.intermediate_size;
-        gate_proj = Linear(hidden_size, intermediate_size, false, base_name + names._gate_proj_name);
-        gelu = SiLU(base_name + "act");
-        up_proj = Linear(hidden_size, intermediate_size, false, base_name + names._up_proj_name);
-        down_proj = Linear(intermediate_size, hidden_size, false, base_name + names._down_proj_name);
-    }
-    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
-        auto x = gate_proj(inputs[0]);
-        x = gelu(x);
-        auto y = up_proj(inputs[0]);
-        x = x * y;
-        x = down_proj(x);
-        return {x};
-    }
-};
-
-class DeepseekDecoder final : public Module {
-private:
-    DeepseekMultiHeadLatentAttention self_atten;
-    DeepseekMLP mlp;
-    Layer input_layernorm;
-    Layer post_attention_layernorm;
-public:
-    DeepseekDecoder() = default;
-    DeepseekDecoder(const DeepseekConfig &config, const DeepseekNameConfig &names, const string &base_name) {
-        self_atten = DeepseekMultiHeadLatentAttention(config, names, base_name + names._attn_base_name);
-        mlp = DeepseekMLP(config, names, base_name + names._ffn_base_name);
-        input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._attn_norm_name);
-        post_attention_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._ffn_norm_name);
-    }
-
-    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
-        auto x = input_layernorm(inputs[0]);
-        x = self_atten({x, x, x})[0];
-        auto tmp = x + inputs[0];
-        x = post_attention_layernorm(tmp);
-        x = mlp({x})[0];
-        x = x + tmp;
-        return {x};
-    }
-};
-
-class DeepseekModel final : public Module {
-private:
-    std::vector<DeepseekDecoder> blocks;
-    Layer norm;
-public:
-    DeepseekModel() = default;
-    DeepseekModel(const DeepseekConfig &config, const DeepseekNameConfig &names, const string &base_name) {
-        blocks = List<DeepseekDecoder>(config.num_hidden_layers, config, names, base_name);
-        norm = RMSNorm(config.hidden_size, config.rms_norm_eps, true, names.post_norm_name);
-    }
-
-    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
-        auto x = inputs[0];
-        for (auto &block : blocks) {
-            x = block({x})[0];
-        }
-        x = norm(x);
-        return {x};
-    }
-};
-
-class DeepseekForCausalLM final : public Module {
-private:
-    int hidden_size;
-    Layer embedding;
-    Parameter lm_head;
-    DeepseekModel model;
-public:
-    DeepseekForCausalLM(DeepseekConfig &config) {
-        auto names = config.names_config;
-        hidden_size = config.hidden_size;
-        embedding = Embedding(config.vocab_size, config.hidden_size, names.token_embd_name);
-        model = DeepseekModel(config, names, names.blk_name);
-
-        // lm_head and tok_embedding is tied together.
-        // They share same parameters. Use a Transpose to do the lm_head instead.
-        lm_head = Parameter(1, config.vocab_size, 1, config.hidden_size, names.lm_head_name + ".weight");
-    }
-    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
-        auto x = embedding(inputs[0]);
-        auto outputs = model({x})[0];
-        outputs = Tensor::mm(outputs, lm_head().transpose(Chl::SEQUENCE, Chl::DIMENSION));
-        return {outputs};
-    }
-};
-
-
-#endif // MODELING_DEEPSEEK_HPP
diff --git a/src/models/stablelm/modeling_stablelm.hpp b/src/models/stablelm/modeling_stablelm.hpp
index 2c834379..50737a2a 100644
--- a/src/models/stablelm/modeling_stablelm.hpp
+++ b/src/models/stablelm/modeling_stablelm.hpp
@@ -3,53 +3,37 @@
 #include "Layer.hpp"
 #include "Module.hpp"
+#include "Types.hpp"
 #include "configuration_stablelm.hpp"
-#include "models/transformer/modeling_transformer.hpp"
-#include
 
 using namespace mllm;
 
 class StableLMMultiHeadAttention final : public Module {
-    Layer qkv_proj;
-    Split qkv_split;
     Layer q_proj;
     Layer k_proj;
     Layer v_proj;
     Layer q_rope;
     Layer k_rope;
-    Layer q_norm;
-    Layer k_norm;
     KVCache k_cache;
     KVCache v_cache;
     Softmax softmax;
     Layer o_proj;
-    Parameter bias_k;
-    Parameter bias_v;
     int head_size_{};
     int kv_head_size_{};
     int attn_hidden_dim_{};
+    Chl split_chl_{};
 
 public:
     StableLMMultiHeadAttention() = default;
     StableLMMultiHeadAttention(int hidden_dim, int head_size, int kv_head_size, int attn_hidden_dim,
-                               AttnQKVSplitType do_qkv_proj, bool post_qkv_norm, bool bias_kv_cat,
                                RoPEType RoPE_type, int cache_limit, bool do_mask, bool bias,
                                const TransformerNameConfig &names, const string &base_name) {
         attn_hidden_dim_ = attn_hidden_dim;
         head_size_ = head_size;
         kv_head_size_ = kv_head_size;
-        if (do_qkv_proj > 0) {
-            qkv_proj = Linear(hidden_dim, head_size * attn_hidden_dim * 3, bias, base_name + names._qkv_proj_name);
-            qkv_split = Split(3, (Chl)do_qkv_proj, head_size, base_name + names._qkv_proj_name + ".split");
-        } else {
-            q_proj = Linear(hidden_dim, head_size * attn_hidden_dim, bias, base_name + names._q_proj_name);
-            k_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._k_proj_name);
-            v_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._v_proj_name);
-        }
-        if (post_qkv_norm) {
-            q_norm = LayerNorm(attn_hidden_dim, true, 1e-6, base_name + names._q_norm_name);
-            k_norm = LayerNorm(attn_hidden_dim, true, 1e-6, base_name + names._k_norm_name);
-        }
+        q_proj = Linear(hidden_dim, head_size * attn_hidden_dim, bias, base_name + names._q_proj_name);
+        k_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._k_proj_name);
+        v_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._v_proj_name);
         if (RoPE_type > 0) {
             q_rope = RoPE(RoPE_type, 10000, 0.25, 4096, base_name + "q_rope");
             k_rope = RoPE(RoPE_type, 10000, 0.25, 4096, base_name + "k_rope");
@@ -60,35 +44,16 @@ class StableLMMultiHeadAttention final : public Module {
         }
         softmax = Softmax(DIMENSION, do_mask, base_name + "softmax");
         o_proj = Linear(head_size * attn_hidden_dim, hidden_dim, false, base_name + names._o_proj_name);
-        if (bias_kv_cat) {
-            bias_k = Parameter(1, 1, head_size, attn_hidden_dim, base_name + "bias_k");
-            bias_v = Parameter(1, 1, head_size, attn_hidden_dim, base_name + "bias_v");
-        }
+
     }
     vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
         Tensor q, k, v;
-        if (qkv_proj.ready()) {
-            auto qkv = qkv_proj(inputs[0]);
-            auto qkv_sp = qkv_split(qkv);
-            q = qkv_sp[0];
-            k = qkv_sp[1];
-            v = qkv_sp[2];
-        } else {
-            q = q_proj(inputs[0]);
-            k = k_proj(inputs[1]);
-            v = v_proj(inputs[2]);
-            q = q.view(-1, head_size_, -1, attn_hidden_dim_);
-            k = k.view(-1, kv_head_size_, -1, attn_hidden_dim_);
-            v = v.view(-1, kv_head_size_, -1, attn_hidden_dim_);
-        }
-        if (q_norm.ready() && k_norm.ready()) {
-            q = q_norm(q);
-            k = k_norm(k);
-        }
-        if (bias_k.ready() && bias_v.ready()) {
-            k = Tensor::cat({k, bias_k()}, SEQUENCE);
-            v = Tensor::cat({v, bias_v()}, SEQUENCE);
-        }
+        q = q_proj(inputs[0]);
+        k = k_proj(inputs[1]);
+        v = v_proj(inputs[2]);
+        q = q.view(-1, head_size_, -1, attn_hidden_dim_);
+        k = k.view(-1, kv_head_size_, -1, attn_hidden_dim_);
+        v = v.view(-1, kv_head_size_, -1, attn_hidden_dim_);
         if (q_rope.ready() && k_rope.ready()) {
             q = q_rope(q);
             k = k_rope(k);
@@ -141,7 +106,7 @@ class StableLMBlock final : public Module {
 public:
     StableLMBlock() = default;
     StableLMBlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type, int cache_limit, const stablelmNameConfig &names, const string &base_name) {
-        attention = StableLMMultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size, SPLIT_NONE, false, false,
+        attention = StableLMMultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size,
                                                RoPE_type, cache_limit, true, true, names, base_name + names._attn_base_name);
         mlp = StableLMMLP(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name);
         norm1 = LayerNorm(hidden_dim, true, 1e-5, base_name + names._attn_norm_name);
diff --git a/src/models/transformer/modeling_transformer.hpp b/src/models/transformer/modeling_transformer.hpp
index b633d8c4..509b1883 100644
--- a/src/models/transformer/modeling_transformer.hpp
+++ b/src/models/transformer/modeling_transformer.hpp
@@ -21,7 +21,6 @@ enum AttnQKVSplitType {
 
 class MultiHeadAttention final : public Module {
     Layer qkv_proj;
-    Split qkv_split;
     Layer q_proj;
     Layer k_proj;
     Layer v_proj;
@@ -38,6 +37,7 @@ class MultiHeadAttention final : public Module {
     int head_size_{};
     int kv_head_size_{};
     int attn_hidden_dim_{};
+    Chl split_chl_{};
 
 public:
     MultiHeadAttention() = default;
@@ -51,7 +51,7 @@
         kv_head_size_ = kv_head_size;
         if (do_qkv_proj > 0) {
             qkv_proj = Linear(hidden_dim, head_size * attn_hidden_dim * 3, bias, base_name + names._qkv_proj_name);
-            qkv_split = Split(3, (Chl)do_qkv_proj, head_size, base_name + names._qkv_proj_name + ".split");
+            split_chl_ = (Chl)do_qkv_proj;
         } else {
            q_proj = Linear(hidden_dim, head_size * attn_hidden_dim, bias, base_name + names._q_proj_name);
            k_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._k_proj_name);
@@ -80,7 +80,7 @@
         Tensor q, k, v;
         if (qkv_proj.ready()) {
             auto qkv = qkv_proj(inputs[0]);
-            auto qkv_sp = qkv_split(qkv);
+            auto qkv_sp = Tensor::split(qkv, {attn_hidden_dim_, attn_hidden_dim_, attn_hidden_dim_}, split_chl_, head_size_);
             q = qkv_sp[0];
             k = qkv_sp[1];
             v = qkv_sp[2];
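
Usage note (not part of the patch): the net effect of this change is that the dedicated Split layer is retired (commented out in Layer.hpp) and callers are expected to invoke the static Tensor::split helper directly, as the MultiHeadAttention hunk above now does; the Tensor.cpp hunk also shortens the generated child-tensor names to "<input>.split-<i>". The sketch below is a minimal illustration of that calling pattern under those assumptions: the FusedQKVExample module is hypothetical and only mirrors the fused-QKV path shown in modeling_transformer.hpp, while the headers and the Linear/Tensor::split signatures are the ones used elsewhere in this diff.

// Hypothetical example module; mirrors the post-change MultiHeadAttention code path.
#include <any>
#include <string>
#include <vector>

#include "Layer.hpp"   // Linear
#include "Module.hpp"  // Module base class
#include "Tensor.hpp"  // Tensor::split
#include "Types.hpp"   // Chl

using namespace mllm;

class FusedQKVExample final : public Module {
    Layer qkv_proj;
    int head_size_{};
    int attn_hidden_dim_{};
    Chl split_chl_{}; // remember only the split channel instead of owning a Split layer

public:
    FusedQKVExample() = default;
    FusedQKVExample(int hidden_dim, int head_size, int attn_hidden_dim, Chl split_chl,
                    const std::string &base_name) {
        head_size_ = head_size;
        attn_hidden_dim_ = attn_hidden_dim;
        split_chl_ = split_chl;
        // Single fused projection; no Split op is registered in the graph any more.
        qkv_proj = Linear(hidden_dim, head_size * attn_hidden_dim * 3, false, base_name + "qkv_proj");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto qkv = qkv_proj(inputs[0]);
        // Static helper from Tensor.cpp: each_dims gives the width of each chunk,
        // split_chl_ selects the channel, head_size_ is forwarded for head-aware splits.
        // The resulting child tensors are named "<qkv-name>.split-0/1/2" (see the Tensor.cpp hunk).
        auto qkv_sp = Tensor::split(qkv, {attn_hidden_dim_, attn_hidden_dim_, attn_hidden_dim_},
                                    split_chl_, head_size_);
        return {qkv_sp[0], qkv_sp[1], qkv_sp[2]}; // q, k, v
    }
};

Keeping only the split channel as member state, rather than a registered Split op, moves the split into a tensor-level static function resolved at Forward time, which is what allows the simpler ".split-i" naming introduced in Tensor.cpp.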