AmpereComputingAI · kkontny · Mar 1, 2024 · Mar 1, 2024
diff --git a/natural_language_processing/text_generation/alpaca/run.py b/natural_language_processing/text_generation/alpaca/run.py
@@ -3,7 +3,7 @@
 from utils.benchmark import run_model
 
 
-def run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=False):
+def run_pytorch(model_path, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False, revision=None):
     from transformers import AutoModelForCausalLM, AutoTokenizer
 
     def run_single_pass(pytorch_runner, _dataset):
@@ -13,7 +13,7 @@ def run_single_pass(pytorch_runner, _dataset):
         response = decode(outputs[:, inputs.input_ids.shape[1]:])
         _dataset.submit_prediction(response)
 
-    model = AutoModelForCausalLM.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, revision=revision)
     if use_torch_fp16:
         model = model.half()
     model.eval()
@@ -30,11 +30,14 @@ def run_single_pass(pytorch_runner, _dataset):
 
 
 def run_pytorch_fp32(model_path, num_runs, timeout, dataset_path, **kwargs):
-    return run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=False)
+    return run_pytorch(model_path, kwargs.get("batch_size", 1), num_runs, timeout, dataset_path)  # run_pytorch now requires batch_size; fallback of 1 is assumed - TODO confirm
 
 def run_pytorch_fp16(model_path, num_runs, timeout, dataset_path, **kwargs):
-    return run_pytorch(model_path, num_runs, timeout, dataset_path, use_torch_fp16=True)
+    return run_pytorch(model_path, kwargs.get("batch_size", 1), num_runs, timeout, dataset_path, use_torch_fp16=True)  # run_pytorch now requires batch_size; fallback of 1 is assumed - TODO confirm
 
+def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
+    return run_pytorch(model_path=model_name, batch_size=batch_size, num_runs=num_runs, timeout=timeout, dataset_path=dataset_path, revision=revision)  # fp16 stays at its default (off); revision is forwarded to from_pretrained
+
 
 def run_pytorch_cuda(model_path, num_runs, timeout, dataset_path, **kwargs):
     from transformers import AutoModelForCausalLM, AutoTokenizer

diff --git a/natural_language_processing/text_generation/llama2/run.py b/natural_language_processing/text_generation/llama2/run.py
@@ -4,7 +4,7 @@
 from transformers import LlamaForCausalLM, AutoTokenizer
 
 
-def run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False):
+def run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=False, revision=None):
     def run_single_pass(pytorch_runner, _dataset):
         input_tensor = tokenizer.encode(_dataset.get_input_string(), return_tensors="pt")
         input_tensor = torch.cat([input_tensor for _ in range(batch_size)], 0)
@@ -18,7 +18,7 @@ def run_single_pass(pytorch_runner, _dataset):
     np.random.seed(44)
     torch.manual_seed(44)
 
-    model = LlamaForCausalLM.from_pretrained(model_name, torchscript=True)
+    model = LlamaForCausalLM.from_pretrained(model_name, torchscript=True, revision=revision)
     model.eval()
     if use_torch_fp16:
         model = model.half()
@@ -42,6 +42,10 @@ def run_pytorch_fp32(model_name, batch_size, num_runs, timeout, dataset_path, **
 def run_pytorch_fp16(model_name, batch_size, num_runs, timeout, dataset_path, **kwargs):
     return run_pytorch(model_name, batch_size, num_runs, timeout, dataset_path, use_torch_fp16=True)
 
+def run_pytorch_int8(model_name, batch_size, num_runs, timeout, dataset_path, revision, **kwargs):
+    return run_pytorch(model_name=model_name, batch_size=batch_size, num_runs=num_runs, timeout=timeout, dataset_path=dataset_path, revision=revision)  # fp16 stays at its default (off); revision is forwarded to from_pretrained
+
+
 def main():
     from utils.helpers import DefaultArgParser
     llama_variants = ["meta-llama/Llama-2-7b-chat-hf", "meta-llama/Llama-2-13b-chat-hf"]