diff --git a/test/quantization/test_quant_api.py b/test/quantization/test_quant_api.py
index b22a15756..e8b9d606d 100644
--- a/test/quantization/test_quant_api.py
+++ b/test/quantization/test_quant_api.py
@@ -238,7 +238,6 @@ def test_8da4w_quantizer(self):
     def test_8da4w_gptq_quantizer(self):
         from torchao.quantization.GPTQ import Int8DynActInt4WeightGPTQQuantizer
         from torchao._models._eval import InputRecorder, TransformerEvalWrapper
-        torchao._models.llama.model.use_index_put_for_kv_cache = True
         # should be similar to TorchCompileDynamicQuantizer
         precision = torch.bfloat16
         device = "cpu"
@@ -338,7 +337,6 @@ def test_8da4w_quantizer_eval(self):
     def test_gptq_quantizer_int4_weight_only(self):
         from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer
         from torchao._models._eval import InputRecorder, TransformerEvalWrapper
-        torchao._models.llama.model.use_index_put_for_kv_cache = True
         precision = torch.bfloat16
         device = "cuda"
         checkpoint_path = Path("../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth")
diff --git a/torchao/_models/_eval.py b/torchao/_models/_eval.py
index 2d5386565..858196776 100644
--- a/torchao/_models/_eval.py
+++ b/torchao/_models/_eval.py
@@ -52,8 +52,13 @@ def __init__(
         pad_token=0,
         device="cpu",
     ):
-        super().__init__()
-        self._tokenizer = tokenizer
+        try:
+            super().__init__()
+        except TypeError:
+            # lm_eval 0.4.2 removed the default init
+            super().__init__("gpt2", device="cpu")
+
+        self.tokenizer = tokenizer
         self._device = torch.device(device)
         self.vocab_size = vocab_size
         self._max_seq_length = calibration_seq_length
@@ -74,9 +79,9 @@ def __init__(
     @property
     def eot_token_id(self):
         try:
-            return self._tokenizer.eos_id()
+            return self.tokenizer.eos_id()
         except:
-            return self._tokenizer.eos_id
+            return self.tokenizer.eos_id

     @property
     def max_length(self):
@@ -96,16 +101,16 @@ def device(self):

     def tok_encode(self, string: str, **kwargs):
         # TODO: verify this for multi-batch as well
-        tokens = self._tokenizer.encode(string)
-        if hasattr(self._tokenizer, "bos_id"):
+        tokens = self.tokenizer.encode(string)
+        if hasattr(self.tokenizer, "bos_id"):
             try:
-                tokens = [self._tokenizer.bos_id()] + tokens
+                tokens = [self.tokenizer.bos_id()] + tokens
             except:
-                tokens = [self._tokenizer.bos_id] + tokens
+                tokens = [self.tokenizer.bos_id] + tokens
         return tokens

     def tok_decode(self, tokens):
-        decoded = self._tokenizer.decode(tokens)
+        decoded = self.tokenizer.decode(tokens)
         return decoded

     def add_input(self, args):
@@ -185,9 +190,9 @@ def __init__(
         input_prep_func=None,
         device="cuda"
     ):
-        super().__init__(None, None)
+        super().__init__(tokenizer, None)
         self._model = model
-        self._tokenizer = tokenizer
+        # self.tokenizer = tokenizer
         self._device = torch.device(device)
         self._max_seq_length = max_seq_length

diff --git a/torchao/_models/llama/eval.py b/torchao/_models/llama/eval.py
index 7842c3e66..36e508501 100644
--- a/torchao/_models/llama/eval.py
+++ b/torchao/_models/llama/eval.py
@@ -13,7 +13,7 @@
 )

 from torchao.quantization.quant_api import (
-    quantize, int4wo, int8wo, int8da_int8w, unwrap_tensor_subclass
+    quantize, int4_weight_only, int8_weight_only, int8_dynamic_activation_int8_weight, unwrap_tensor_subclass
 )
 from torchao._models._eval import TransformerEvalWrapper, InputRecorder

@@ -21,7 +21,7 @@
 from tokenizer import get_tokenizer
 import time
 from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer
-from model import prepare_inputs_for_model
+from torchao._models.llama.model import prepare_inputs_for_model

 torch._inductor.config.fx_graph_cache = True
 torch._inductor.config.force_fuse_int_mm_with_mul = True
@@ -60,17 +60,18 @@ def run_evaluation(

     if quantization:
         if "int8wo" in quantization:
-            quantize(model, int8wo())
+            quantize(model, int8_weight_only())
         if "int8dq" in quantization:
-            quantize(model, int8da_int8w())
+            quantize(model, int8_dynamic_activation_int8_weight())
         if "int4wo" in quantization and not "gptq" in quantization:
             groupsize=int(quantization.split("-")[-1])
             assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}"
-            quantize(model, int4wo(groupsize=groupsize))
+            quantize(model.to(device), int4_weight_only(group_size=groupsize))
         if "int4wo" in quantization and "gptq" in quantization:
             groupsize=int(quantization.split("-")[-2])
             assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}"
             assert precision==torch.bfloat16, f"{quantization} requires precision or bfloat16 but got {precision}"
+            assert "cuda" in device, "int4 gptq quantization only works on cuda"
             inputs = InputRecorder(
                 tokenizer,
                 calibration_seq_length,
@@ -83,7 +84,7 @@ def run_evaluation(
                 calibration_limit,
             ).get_inputs()

-            quantizer = Int4WeightOnlyGPTQQuantizer(groupsize=groupsize)
+            quantizer = Int4WeightOnlyGPTQQuantizer(groupsize=groupsize, device=device)
             model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length)
             model = quantizer.quantize(model, inputs).to(device)
         else:
diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py
index 7f7cfab88..34e7ca82b 100644
--- a/torchao/_models/llama/generate.py
+++ b/torchao/_models/llama/generate.py
@@ -35,8 +35,8 @@ def device_sync(device):
 wd = Path(__file__).parent.parent.resolve()
 sys.path.append(str(wd))

-from model import Transformer, prepare_inputs_for_model
-from tokenizer import get_tokenizer
+from torchao._models.llama.model import Transformer, prepare_inputs_for_model
+from torchao._models.llama.tokenizer import get_tokenizer

 def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization
     q = torch.empty_like(probs_sort).exponential_(1)
@@ -189,21 +189,21 @@ def main(

     if quantization:
         from torchao.quantization.quant_api import (
             quantize,
-            int8wo,
-            int8da_int8w,
-            int4wo,
+            int8_weight_only,
+            int8_dynamic_activation_int8_weight,
+            int4_weight_only,
             autoquant,
             unwrap_tensor_subclass
         )
         if "int8wo" in quantization:
-            quantize(model, int8wo())
+            quantize(model, int8_weight_only())
         if "int8dq" in quantization:
-            quantize(model, int8da_int8w())
+            quantize(model, int8_dynamic_activation_int8_weight())
         if "int4wo" in quantization:
             groupsize=int(quantization.split("-")[-1])
             assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}"
-            quantize(model, int4wo(groupsize=groupsize))
+            quantize(model, int4_weight_only(groupsize=groupsize))
         if "autoquant" == quantization:
             model = autoquant(model, manual=True)

diff --git a/torchao/_models/llama/tokenizer.py b/torchao/_models/llama/tokenizer.py
index 15082068d..2d6be2dcf 100644
--- a/torchao/_models/llama/tokenizer.py
+++ b/torchao/_models/llama/tokenizer.py
@@ -30,6 +30,8 @@ class SentencePieceWrapper(TokenizerInterface):
     def __init__(self, model_path):
         super().__init__(model_path)
         self.processor = spm.SentencePieceProcessor(str(model_path))
+        self.bos_token_id = self.bos_id()
+        self.eos_token_id = self.eos_id()

     def encode(self, text):
         return self.processor.EncodeAsIds(text)
@@ -86,6 +88,8 @@ def __init__(self, model_path):
         # BOS / EOS token IDs
         self._bos_id: int = self.special_tokens["<|begin_of_text|>"]
         self._eos_id: int = self.special_tokens["<|end_of_text|>"]
+        self.bos_token_id = self.bos_id()
+        self.eos_token_id = self.eos_id()

     def encode(self, text):
         return self.model.encode(text)
diff --git a/torchao/quantization/GPTQ.py b/torchao/quantization/GPTQ.py
index 12910c89b..275e716aa 100644
--- a/torchao/quantization/GPTQ.py
+++ b/torchao/quantization/GPTQ.py
@@ -81,6 +81,9 @@ def __init__(
         # trace model for one input
         one_input = [multi.values[0].cpu() for multi in inputs]  # pyre-ignore[16]
+        # needed for GPTQ on the torchao llama model
+        import torchao
+        torchao._models.llama.model.use_index_put_for_kv_cache = True
         exported_model = torch._dynamo.export(
             model.cpu(), aten_graph=True, pre_dispatch=True, tracing_mode="fake"
         )(*one_input)
@@ -95,7 +98,7 @@ def __init__(
         self.groupsize = groupsize
         self.inputs = inputs
         self.gptq_done = False
-        self.debug = False
+        self.debug = True

     def configure_quantization_mode(
         self,
@@ -672,9 +675,9 @@ def quantize(
 class Int4WeightOnlyGPTQQuantizer(GPTQQuantizer):
     def __init__(
         self,
-        blocksize,
-        percdamp,
-        groupsize,
+        blocksize=128,
+        percdamp=0.01,
+        groupsize=64,
         inner_k_tiles=8,
         padding_allowed=True,
         device: torch.device = torch.device("cuda"),
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
index f1bb82921..b4139f0c1 100644
--- a/torchao/quantization/__init__.py
+++ b/torchao/quantization/__init__.py
@@ -17,8 +17,6 @@
     "swap_conv2d_1x1_to_linear"
     "safe_int_mm",
     "autoquant",
-    "change_linears_to_autoquantizable",
-    "change_autoquantizable_to_quantized",
     "get_scale",
     "SmoothFakeDynQuantMixin",
     "SmoothFakeDynamicallyQuantizedLinear",
diff --git a/torchao/quantization/autoquant.py b/torchao/quantization/autoquant.py
index 1a63fed57..18a58cd17 100644
--- a/torchao/quantization/autoquant.py
+++ b/torchao/quantization/autoquant.py
@@ -17,6 +17,12 @@
 except:
     from torch._inductor.runtime.runtime_utils import do_bench

+__all__ = [
+    "AutoQuantizableLinearWeight",
+    "autoquant",
+]
+
+
 aten = torch.ops.aten

 AUTOQUANT_CACHE = {}
@@ -382,11 +388,11 @@ def from_float(cls, weight):
     AQInt8DynamicallyQuantizedLinearWeight,
 ]

-def change_linears_to_autoquantizable(model, **kwargs):
+def _change_linears_to_autoquantizable(model, **kwargs):
     """
     Converts all linear weight tensors to the
     AutoQuantizableLinearWeight tensor subclass. Expectation is that this is followed
-    by running the model and then calling change_autoquantizable_to_quantized
+    by running the model and then calling _change_autoquantizable_to_quantized
     """
     from torchao.quantization.quant_api import _is_linear
     filter_fn = kwargs.pop("filter_fn", _is_linear)
@@ -401,7 +407,7 @@ def change_linears_to_autoquantizable(model, **kwargs):
         filter_fn if filter_fn is not None else _is_linear,
     )

-def change_autoquantizable_to_quantized(model, **kwargs):
+def _change_autoquantizable_to_quantized(model, **kwargs):
     """
     Converts AutoQuantizableLinearWeight tensor subclasses
     to various quantized/non-quantized tensor subclasses depending
@@ -490,7 +496,7 @@ def autoquant(

     # perform initial swap from linear weights
     # to AutoQuantizableLinearWeight
-    change_linears_to_autoquantizable(
+    _change_linears_to_autoquantizable(
         model,
         filter_fn=filter_fn,
         qtensor_class_list=qtensor_class_list,
@@ -531,7 +537,7 @@ def autoquant_prehook(module, args, kwargs):
     # note the torch.compile wrapper (eval_frame) moves the assignment of any assigned
     # attributes to the inner model that didn't exist before, so we have to call delattr on the inner model
     def finalize_autoquant():
-        change_autoquantizable_to_quantized(
+        _change_autoquantizable_to_quantized(
             real_model,
             **aq_kwargs,
         )
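
For reference, a minimal sketch of driving the renamed quantization entry points used in this patch, mirroring the updated call sites in eval.py and generate.py. It is not part of the diff: the apply_quantization helper, the model object, and the quantization/device strings are hypothetical stand-ins, and the group_size keyword follows the eval.py call shown above.

# Sketch only: assumes `model` is a torch.nn.Module already loaded in the desired precision.
import torch
from torchao.quantization.quant_api import (
    quantize,
    int8_weight_only,
    int8_dynamic_activation_int8_weight,
    int4_weight_only,
)

def apply_quantization(model: torch.nn.Module, quantization: str, device: str = "cuda") -> torch.nn.Module:
    # Dispatch on the same string flags the scripts in this diff use.
    if "int8wo" in quantization:
        quantize(model, int8_weight_only())
    elif "int8dq" in quantization:
        quantize(model, int8_dynamic_activation_int8_weight())
    elif "int4wo" in quantization:
        # e.g. "int4wo-64" -> group size 64; the int4 weight-only kernels expect CUDA.
        group_size = int(quantization.split("-")[-1])
        quantize(model.to(device), int4_weight_only(group_size=group_size))
    return model

The GPTQ path additionally needs calibration inputs collected with InputRecorder before calling Int4WeightOnlyGPTQQuantizer(groupsize=..., device=...).quantize(model, inputs), as shown in the eval.py hunk above.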