diff --git a/.gitignore b/.gitignore
index 4e851d01..771bf7b4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,4 +30,5 @@ src/backends/qnn/qualcomm_ai_engine_direct_220/*
 src/backends/qnn/HexagonSDK/*
 examples/demo_deepseek.cpp
 examples/demo_xllm.cpp
-src/models/xllm/*
\ No newline at end of file
+src/models/xllm/*
+src/models/deepseek/*
\ No newline at end of file
diff --git a/src/Layer.hpp b/src/Layer.hpp
index 4625224f..04968752 100644
--- a/src/Layer.hpp
+++ b/src/Layer.hpp
@@ -544,7 +544,7 @@ class Matmul final : public Layer {
         return ts[0].get();
     }
 };
-
+/*
 class Split final : public Layer {
 public:
     Split() = default;
@@ -570,7 +570,7 @@ class Split final : public Layer {
         return run({input}, (int)param_["split_num"]);
     }
 };
-
+*/
 class Convolution2D final : public Layer {
 public:
     explicit Convolution2D(int in_channel, int out_channel, vector<int> kernal, vector<int> stride, PaddingType padding, bool bias, std::string name) {
diff --git a/src/Tensor.cpp b/src/Tensor.cpp
index e41e190b..efdd1c88 100644
--- a/src/Tensor.cpp
+++ b/src/Tensor.cpp
@@ -342,7 +342,8 @@ vector<Tensor> Tensor::split(Tensor &input, std::vector<int> each_dims, Chl split_dim, int head_size) {
     std::vector args;
     for (int i = 0; i < each_dims.size(); ++i) {
         args.push_back(each_dims[i]);
-        next_names.push_back(input.name() + "-split-" + std::to_string(i) + "-" + std::to_string(each_dims[i]));
+        // next_names.push_back(input.name() + "-split-" + std::to_string(i) + "-" + std::to_string(each_dims[i]));
+        next_names.push_back(input.name() + ".split-" + std::to_string(i));
     }
     args.push_back(split_dim);
     args.push_back(head_size);
diff --git a/src/models/deepseek/configuration_deepseek.hpp b/src/models/deepseek/configuration_deepseek.hpp
deleted file mode 100644
index c5a1d70a..00000000
--- a/src/models/deepseek/configuration_deepseek.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-
-#ifndef CONFIG_DEEPSEEK_HPP
-#define CONFIG_DEEPSEEK_HPP
-#include "Types.hpp"
-#include "models/transformer/configuration_transformer.hpp"
-
-using namespace mllm;
-
-class DeepseekNameConfig : public TransformerNameConfig {
-public:
-    /**
-     * @brief Deepseek following the hugging face naming method
-     *
-     * @param type RoPEType
-     */
-    void init() {
-        blk_name = "model.layers.";
-        _attn_base_name = "self_attn.";
-        _ffn_base_name = "mlp.";
-        _q_proj_name = "q_proj";
-        _kv_a_proj_with_mqa_name = "kv_a_proj_with_mqa";
-        _kv_a_layernorm_name = "kv_a_layernorm";
-        _kv_b_proj_name = "kv_b_proj";
-        _o_proj_name = "o_proj";
-        _gate_proj_name = "gate_proj";
-        _up_proj_name = "up_proj";
-        _down_proj_name = "down_proj";
-        _attn_norm_name = "input_layernorm";
-        _ffn_norm_name = "post_attention_layernorm";
-        token_embd_name = "model.embed_tokens";
-        post_norm_name = "model.norm";
-        lm_head_name = "model.embed_tokens";
-    }
-
-    std::string _kv_a_proj_with_mqa_name;
-    std::string _kv_a_layernorm_name;
-    std::string _kv_b_proj_name;
-
-    std::string blk_name;
-    std::string token_embd_name;
-    std::string post_norm_name;
-    std::string lm_head_name;
-    std::string _gate_proj_name;
-};
-
-struct DeepseekConfig {
-    explicit DeepseekConfig(int token_limit):
-        cache_limit(token_limit) {
-        names_config.init();
-    };
-
-    int vocab_size = 32000;
-    int max_position_embeddings = 8192;
-    int num_hidden_layers = 12;
-    int hidden_size = 768;
-    int intermediate_size = 2048;
-    int num_heads = 16;
-    int qk_rope_head_dim=24;//qk_rope_head_dim
-    int qk_nope_head_dim=48; //qk_nope_head_dim = qk_rope_head_dim*2
-    int v_head_dim=48; //v_head_dim= qk_nope_head_dim*2
-    int kv_lora_rank = 192; //kv_lora_rank = 2568* qk_nope_head_dim;
-
-    // int vocab_size = 152064;
-    // int max_position_embeddings = 8192;
-    // int num_hidden_layers = 12;
-    // int hidden_size = 1024;
-    // int intermediate_size = 4864;
-    // int num_heads = 16;
-    // int qk_rope_head_dim=32;//qk_rope_head_dim
-    // int qk_nope_head_dim=64; //qk_nope_head_dim = qk_rope_head_dim*2
-    // int v_head_dim=64; //v_head_dim= qk_nope_head_dim*2
-    // int kv_lora_rank = 256; //kv_lora_rank = 2568* qk_nope_head_dim;
-
-
-    float rms_norm_eps = 1e-6;
-    int cache_limit;
-    bool do_mask=true;
-
-
-    DeepseekNameConfig names_config;
-};
-
-#endif //! CONFIG_DEEPSEEK_HPP
diff --git a/src/models/deepseek/modeling_deepseek.hpp b/src/models/deepseek/modeling_deepseek.hpp
deleted file mode 100644
index 3fbece36..00000000
--- a/src/models/deepseek/modeling_deepseek.hpp
+++ /dev/null
@@ -1,203 +0,0 @@
-//
-// Created by Rongjie Yi on 24-6-20.
-//
-
-#ifndef MODELING_DEEPSEEK_HPP
-#define MODELING_DEEPSEEK_HPP
-
-#include "configuration_deepseek.hpp"
-
-using namespace mllm;
-
-class DeepseekMultiHeadLatentAttention final : public Module {
-    Layer q_proj;
-    Layer kv_a_proj_with_mqa;
-    Layer kv_a_layernorm;
-    Layer kv_b_proj;
-    Layer k_proj;
-    Layer v_proj;
-    Layer q_rope;
-    Layer k_rope;
-    KVCache k_cache;
-    KVCache v_cache;
-    Softmax softmax;
-    Layer o_proj;
-    int num_heads{};
-    int q_head_dim{};
-    int v_head_dim{};
-    int qk_nope_head_dim{};
-    int qk_rope_head_dim{};
-    int kv_lora_rank{};
-    float softmax_scale{};
-public:
-    DeepseekMultiHeadLatentAttention() = default;
-    DeepseekMultiHeadLatentAttention(const DeepseekConfig config, const DeepseekNameConfig &names, const string &base_name) {
-        num_heads = config.num_heads;
-        qk_nope_head_dim =config.qk_nope_head_dim;
-        qk_rope_head_dim =config.qk_rope_head_dim;
-        kv_lora_rank = config.kv_lora_rank;
-        v_head_dim = config.v_head_dim;
-        q_head_dim=config.qk_nope_head_dim + config.qk_rope_head_dim;
-        q_proj = Linear(
-            config.hidden_size,
-            num_heads * q_head_dim,
-            false,
-            base_name + names._q_proj_name);
-        kv_a_proj_with_mqa = Linear(
-            config.hidden_size,
-            kv_lora_rank + qk_rope_head_dim,
-            false,
-            base_name + names._kv_a_proj_with_mqa_name
-        );
-        kv_a_layernorm = RMSNorm(kv_lora_rank, config.rms_norm_eps, base_name + names._kv_a_layernorm_name);
-        kv_b_proj = Linear(
-            kv_lora_rank,
-            num_heads * (q_head_dim - qk_rope_head_dim + v_head_dim),
-            false,
-            base_name + names._kv_b_proj_name
-        );
-        o_proj = Linear(
-            num_heads * v_head_dim,
-            config.hidden_size,
-            false,
-            base_name + names._o_proj_name
-        );
-        q_rope = RoPE(RoPEType::MLAROPE, base_name + "q_rope");
-        k_rope = RoPE(RoPEType::MLAROPE, base_name + "k_rope");
-        if (config.cache_limit > 0) {
-            k_cache = KVCache(num_heads/num_heads, config.cache_limit, base_name + "k_cache");
-            v_cache = KVCache(num_heads/num_heads, config.cache_limit, base_name + "v_cache");
-        }
-        softmax = Softmax(DIMENSION, config.do_mask, base_name + "softmax");
-        softmax_scale = 1/std::sqrt(q_head_dim);
-    }
-    vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
-        auto hidden_states = inputs[0];
-
-        auto q = q_proj(hidden_states);
-        auto qs = Tensor::split(q, {qk_nope_head_dim, qk_rope_head_dim}, D_HD, num_heads);
-        q = Tensor::cat({qs[0], q_rope(qs[1])}, DIMENSION);
-
-        Tensor compressed_kv = kv_a_proj_with_mqa(hidden_states);
-        auto kvs = Tensor::split(compressed_kv,
-                                 {kv_lora_rank, qk_rope_head_dim}, DIMENSION);
-        auto k_pe = k_rope(kvs[1]);
-        auto kv = kv_b_proj(kv_a_layernorm(kvs[0]));//.view(-1, head_size_, -1, qk_nope_head_dim_ + v_head_dim_);
-        kvs = Tensor::split(kv, {qk_nope_head_dim, v_head_dim}, D_HD, num_heads);
-        auto v = kvs[1];
-        auto k = Tensor::cat({kvs[0], k_pe}, DIMENSION);
-        if (k_cache.ready() && v_cache.ready()) {
-            k = k_cache(k);
-            v = v_cache(v);
-        }
-        k = k.transpose(SEQUENCE, DIMENSION);
-        auto qk = Tensor::mm(q, k);
-        qk = qk * softmax_scale;
-        qk = softmax(qk, k_cache.getCacheSeqLen());
-        auto o = Tensor::mm(qk, v);
-        o = o.view(-1, 1, -1, v_head_dim * num_heads);
-        o = o_proj(o);
-        return {o};
-    }
-};
-
-class DeepseekMLP final : public Module {
-private:
-    Layer gate_proj;
-    Layer up_proj;
-    Layer down_proj;
-    Layer gelu;
-public:
-    DeepseekMLP() = default;
-    DeepseekMLP(const DeepseekConfig &config, const DeepseekNameConfig &names, const std::string &base_name) {
-        int hidden_size = config.hidden_size;
-        int intermediate_size = config.intermediate_size;
-        gate_proj = Linear(hidden_size, intermediate_size, false, base_name + names._gate_proj_name);
-        gelu = SiLU(base_name + "act");
-        up_proj = Linear(hidden_size, intermediate_size, false, base_name + names._up_proj_name);
-        down_proj = Linear(intermediate_size, hidden_size, false, base_name + names._down_proj_name);
-    }
-    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
-        auto x = gate_proj(inputs[0]);
-        x = gelu(x);
-        auto y = up_proj(inputs[0]);
-        x = x * y;
-        x = down_proj(x);
-        return {x};
-    }
-};
-
-class DeepseekDecoder final : public Module {
-private:
-    DeepseekMultiHeadLatentAttention self_atten;
-    DeepseekMLP mlp;
-    Layer input_layernorm;
-    Layer post_attention_layernorm;
-public:
-    DeepseekDecoder() = default;
-    DeepseekDecoder(const DeepseekConfig &config, const DeepseekNameConfig &names, const string &base_name) {
-        self_atten = DeepseekMultiHeadLatentAttention(config, names, base_name + names._attn_base_name);
-        mlp = DeepseekMLP(config, names, base_name + names._ffn_base_name);
-        input_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._attn_norm_name);
-        post_attention_layernorm = RMSNorm(config.hidden_size, config.rms_norm_eps, base_name + names._ffn_norm_name);
-    }
-
-    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
-        auto x = input_layernorm(inputs[0]);
-        x = self_atten({x, x, x})[0];
-        auto tmp = x + inputs[0];
-        x = post_attention_layernorm(tmp);
-        x = mlp({x})[0];
-        x = x + tmp;
-        return {x};
-    }
-};
-
-class DeepseekModel final : public Module {
-private:
-    std::vector<DeepseekDecoder> blocks;
-    Layer norm;
-public:
-    DeepseekModel() = default;
-    DeepseekModel(const DeepseekConfig &config, const DeepseekNameConfig &names, const string &base_name) {
-        blocks = List<DeepseekDecoder>(config.num_hidden_layers, config, names, base_name);
-        norm = RMSNorm(config.hidden_size, config.rms_norm_eps, true, names.post_norm_name);
-    }
-
-    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
-        auto x = inputs[0];
-        for (auto &block : blocks) {
-            x = block({x})[0];
-        }
-        x = norm(x);
-        return {x};
-    }
-};
-
-class DeepseekForCausalLM final : public Module {
-private:
-    int hidden_size;
-    Layer embedding;
-    Parameter lm_head;
-    DeepseekModel model;
-public:
-    DeepseekForCausalLM(DeepseekConfig &config) {
-        auto names = config.names_config;
-        hidden_size = config.hidden_size;
-        embedding = Embedding(config.vocab_size, config.hidden_size, names.token_embd_name);
-        model = DeepseekModel(config, names, names.blk_name);
-
-        // lm_head and tok_embedding is tied together.
-        // They share same parameters. Use a Transpose to do the lm_head instead.
-        lm_head = Parameter(1, config.vocab_size, 1, config.hidden_size, names.lm_head_name + ".weight");
-    }
-    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
-        auto x = embedding(inputs[0]);
-        auto outputs = model({x})[0];
-        outputs = Tensor::mm(outputs, lm_head().transpose(Chl::SEQUENCE, Chl::DIMENSION));
-        return {outputs};
-    }
-};
-
-
-#endif // MODELING_DEEPSEEK_HPP
diff --git a/src/models/stablelm/modeling_stablelm.hpp b/src/models/stablelm/modeling_stablelm.hpp
index 2c834379..50737a2a 100644
--- a/src/models/stablelm/modeling_stablelm.hpp
+++ b/src/models/stablelm/modeling_stablelm.hpp
@@ -3,53 +3,37 @@
 #include "Layer.hpp"
 #include "Module.hpp"
+#include "Types.hpp"
 #include "configuration_stablelm.hpp"
-#include "models/transformer/modeling_transformer.hpp"
-#include
 
 using namespace mllm;
 
 class StableLMMultiHeadAttention final : public Module {
-    Layer qkv_proj;
-    Split qkv_split;
     Layer q_proj;
     Layer k_proj;
     Layer v_proj;
     Layer q_rope;
     Layer k_rope;
-    Layer q_norm;
-    Layer k_norm;
     KVCache k_cache;
     KVCache v_cache;
     Softmax softmax;
     Layer o_proj;
-    Parameter bias_k;
-    Parameter bias_v;
     int head_size_{};
     int kv_head_size_{};
     int attn_hidden_dim_{};
+    Chl split_chl_{};
 
 public:
     StableLMMultiHeadAttention() = default;
     StableLMMultiHeadAttention(int hidden_dim, int head_size, int kv_head_size, int attn_hidden_dim,
-                               AttnQKVSplitType do_qkv_proj, bool post_qkv_norm, bool bias_kv_cat,
                                RoPEType RoPE_type, int cache_limit, bool do_mask, bool bias,
                                const TransformerNameConfig &names, const string &base_name) {
         attn_hidden_dim_ = attn_hidden_dim;
         head_size_ = head_size;
         kv_head_size_ = kv_head_size;
-        if (do_qkv_proj > 0) {
-            qkv_proj = Linear(hidden_dim, head_size * attn_hidden_dim * 3, bias, base_name + names._qkv_proj_name);
-            qkv_split = Split(3, (Chl)do_qkv_proj, head_size, base_name + names._qkv_proj_name + ".split");
-        } else {
-            q_proj = Linear(hidden_dim, head_size * attn_hidden_dim, bias, base_name + names._q_proj_name);
-            k_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._k_proj_name);
-            v_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._v_proj_name);
-        }
-        if (post_qkv_norm) {
-            q_norm = LayerNorm(attn_hidden_dim, true, 1e-6, base_name + names._q_norm_name);
-            k_norm = LayerNorm(attn_hidden_dim, true, 1e-6, base_name + names._k_norm_name);
-        }
+        q_proj = Linear(hidden_dim, head_size * attn_hidden_dim, bias, base_name + names._q_proj_name);
+        k_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._k_proj_name);
+        v_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._v_proj_name);
         if (RoPE_type > 0) {
             q_rope = RoPE(RoPE_type, 10000, 0.25, 4096, base_name + "q_rope");
             k_rope = RoPE(RoPE_type, 10000, 0.25, 4096, base_name + "k_rope");
@@ -60,35 +44,16 @@ class StableLMMultiHeadAttention final : public Module {
         }
         softmax = Softmax(DIMENSION, do_mask, base_name + "softmax");
         o_proj = Linear(head_size * attn_hidden_dim, hidden_dim, false, base_name + names._o_proj_name);
-        if (bias_kv_cat) {
-            bias_k = Parameter(1, 1, head_size, attn_hidden_dim, base_name + "bias_k");
-            bias_v = Parameter(1, 1, head_size, attn_hidden_dim, base_name + "bias_v");
-        }
+
     }
     vector<Tensor> Forward(vector<Tensor> inputs, vector<std::any> args) override {
         Tensor q, k, v;
-        if (qkv_proj.ready()) {
-            auto qkv = qkv_proj(inputs[0]);
-            auto qkv_sp = qkv_split(qkv);
-            q = qkv_sp[0];
-            k = qkv_sp[1];
-            v = qkv_sp[2];
-        } else {
-            q = q_proj(inputs[0]);
-            k = k_proj(inputs[1]);
-            v = v_proj(inputs[2]);
-            q = q.view(-1, head_size_, -1, attn_hidden_dim_);
-            k = k.view(-1, kv_head_size_, -1, attn_hidden_dim_);
-            v = v.view(-1, kv_head_size_, -1, attn_hidden_dim_);
-        }
-        if (q_norm.ready() && k_norm.ready()) {
-            q = q_norm(q);
-            k = k_norm(k);
-        }
-        if (bias_k.ready() && bias_v.ready()) {
-            k = Tensor::cat({k, bias_k()}, SEQUENCE);
-            v = Tensor::cat({v, bias_v()}, SEQUENCE);
-        }
+        q = q_proj(inputs[0]);
+        k = k_proj(inputs[1]);
+        v = v_proj(inputs[2]);
+        q = q.view(-1, head_size_, -1, attn_hidden_dim_);
+        k = k.view(-1, kv_head_size_, -1, attn_hidden_dim_);
+        v = v.view(-1, kv_head_size_, -1, attn_hidden_dim_);
         if (q_rope.ready() && k_rope.ready()) {
             q = q_rope(q);
             k = k_rope(k);
@@ -141,7 +106,7 @@ class StableLMBlock final : public Module {
 public:
     StableLMBlock() = default;
     StableLMBlock(int hidden_dim, int head_size, int ffn_hidden, RoPEType RoPE_type, int cache_limit, const stablelmNameConfig &names, const string &base_name) {
-        attention = StableLMMultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size, SPLIT_NONE, false, false,
+        attention = StableLMMultiHeadAttention(hidden_dim, head_size, head_size, hidden_dim / head_size,
                                                RoPE_type, cache_limit, true, true, names, base_name + names._attn_base_name);
         mlp = StableLMMLP(hidden_dim, ffn_hidden, names, base_name + names._ffn_base_name);
         norm1 = LayerNorm(hidden_dim, true, 1e-5, base_name + names._attn_norm_name);
diff --git a/src/models/transformer/modeling_transformer.hpp b/src/models/transformer/modeling_transformer.hpp
index b633d8c4..509b1883 100644
--- a/src/models/transformer/modeling_transformer.hpp
+++ b/src/models/transformer/modeling_transformer.hpp
@@ -21,7 +21,6 @@ enum AttnQKVSplitType {
 
 class MultiHeadAttention final : public Module {
     Layer qkv_proj;
-    Split qkv_split;
     Layer q_proj;
     Layer k_proj;
     Layer v_proj;
@@ -38,6 +37,7 @@ class MultiHeadAttention final : public Module {
     int head_size_{};
     int kv_head_size_{};
     int attn_hidden_dim_{};
+    Chl split_chl_{};
 
 public:
     MultiHeadAttention() = default;
@@ -51,7 +51,7 @@
         kv_head_size_ = kv_head_size;
         if (do_qkv_proj > 0) {
             qkv_proj = Linear(hidden_dim, head_size * attn_hidden_dim * 3, bias, base_name + names._qkv_proj_name);
-            qkv_split = Split(3, (Chl)do_qkv_proj, head_size, base_name + names._qkv_proj_name + ".split");
+            split_chl_ = (Chl)do_qkv_proj;
         } else {
            q_proj = Linear(hidden_dim, head_size * attn_hidden_dim, bias, base_name + names._q_proj_name);
            k_proj = Linear(hidden_dim, kv_head_size * attn_hidden_dim, bias, base_name + names._k_proj_name);
@@ -80,7 +80,7 @@
         Tensor q, k, v;
         if (qkv_proj.ready()) {
             auto qkv = qkv_proj(inputs[0]);
-            auto qkv_sp = qkv_split(qkv);
+            auto qkv_sp = Tensor::split(qkv, {attn_hidden_dim_, attn_hidden_dim_, attn_hidden_dim_}, split_chl_, head_size_);
             q = qkv_sp[0];
             k = qkv_sp[1];
             v = qkv_sp[2];
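
Usage note (not part of the patch): the net effect of this change is that the dedicated Split layer is retired (commented out in Layer.hpp) and callers are expected to invoke the static Tensor::split helper directly, as the MultiHeadAttention hunk above now does; the Tensor.cpp hunk also shortens the generated child-tensor names to "<input>.split-<i>". The sketch below is a minimal illustration of that calling pattern under those assumptions: the FusedQKVExample module is hypothetical and only mirrors the fused-QKV path shown in modeling_transformer.hpp, while the headers and the Linear/Tensor::split signatures are the ones used elsewhere in this diff.

// Hypothetical example module; mirrors the post-change MultiHeadAttention code path.
#include <any>
#include <string>
#include <vector>

#include "Layer.hpp"   // Linear
#include "Module.hpp"  // Module base class
#include "Tensor.hpp"  // Tensor::split
#include "Types.hpp"   // Chl

using namespace mllm;

class FusedQKVExample final : public Module {
    Layer qkv_proj;
    int head_size_{};
    int attn_hidden_dim_{};
    Chl split_chl_{}; // remember only the split channel instead of owning a Split layer

public:
    FusedQKVExample() = default;
    FusedQKVExample(int hidden_dim, int head_size, int attn_hidden_dim, Chl split_chl,
                    const std::string &base_name) {
        head_size_ = head_size;
        attn_hidden_dim_ = attn_hidden_dim;
        split_chl_ = split_chl;
        // Single fused projection; no Split op is registered in the graph any more.
        qkv_proj = Linear(hidden_dim, head_size * attn_hidden_dim * 3, false, base_name + "qkv_proj");
    }

    std::vector<Tensor> Forward(std::vector<Tensor> inputs, std::vector<std::any> args) override {
        auto qkv = qkv_proj(inputs[0]);
        // Static helper from Tensor.cpp: each_dims gives the width of each chunk,
        // split_chl_ selects the channel, head_size_ is forwarded for head-aware splits.
        // The resulting child tensors are named "<qkv-name>.split-0/1/2" (see the Tensor.cpp hunk).
        auto qkv_sp = Tensor::split(qkv, {attn_hidden_dim_, attn_hidden_dim_, attn_hidden_dim_},
                                    split_chl_, head_size_);
        return {qkv_sp[0], qkv_sp[1], qkv_sp[2]}; // q, k, v
    }
};

Keeping only the split channel as member state, rather than a registered Split op, moves the split into a tensor-level static function resolved at Forward time, which is what allows the simpler ".split-i" naming introduced in Tensor.cpp.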