From cd4af859c1070e6964a90f86ab0ae71b271aef19 Mon Sep 17 00:00:00 2001
From: Taemin Lee
Date: Fri, 22 Mar 2024 05:16:57 +0900
Subject: [PATCH] [BugFix] gemma loading after quantization or LoRA. (#3553)

---
 vllm/model_executor/models/gemma.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index fd3dbe79..fa8ce60e 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -340,6 +340,10 @@ def load_weights(self,
                 weight_loader(param, loaded_weight, shard_id)
                 break
             else:
+                # lm_head is not used in vllm as it is tied with embed_token.
+                # To prevent errors, skip loading lm_head.weight.
+                if "lm_head.weight" in name:
+                    continue
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
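
For context, below is a minimal sketch of the for/else weight-loading pattern the hunk modifies. The names taken from the diff (stacked_params_mapping, params_dict, weight_loader, shard_id) are kept, but the surrounding scaffolding is assumed for illustration and is not vLLM's exact implementation. The point of the patch is visible in the else branch: checkpoint entries that match no stacked-parameter mapping fall through to a direct params_dict lookup, and a quantized or LoRA-merged Gemma checkpoint may still carry lm_head.weight even though vLLM ties lm_head to the embedding, so that name must be skipped before the lookup raises a KeyError.

    # Hypothetical sketch of the load_weights loop, assuming vLLM-style
    # parameters that carry a weight_loader attribute.
    def load_weights(weights, params_dict, stacked_params_mapping):
        for name, loaded_weight in weights:
            for param_name, shard_name, shard_id in stacked_params_mapping:
                if shard_name not in name:
                    continue
                # Checkpoint shard maps onto a fused/stacked parameter.
                name = name.replace(shard_name, param_name)
                param = params_dict[name]
                param.weight_loader(param, loaded_weight, shard_id)
                break
            else:
                # lm_head is tied to the embedding in Gemma, so skip it
                # (this is the line the patch adds).
                if "lm_head.weight" in name:
                    continue
                # Skip loading extra bias for GPTQ models.
                if name.endswith(".bias") and name not in params_dict:
                    continue
                param = params_dict[name]
                param.weight_loader(param, loaded_weight)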