From 243070084643e1500d6b966ad14780f45d94b9c2 Mon Sep 17 00:00:00 2001
From: Sudhakar Singh
Date: Mon, 2 Jun 2025 14:16:43 -0700
Subject: [PATCH 1/7] add tutorial files and other local changes

Signed-off-by: Sudhakar Singh
---
 docs/examples/te_gemma/check_cuda_graphs.py   |   60 +
 docs/examples/te_gemma/check_gemm.py          |  132 ++
 docs/examples/te_gemma/check_rope.ipynb       |  716 +++++++++
 docs/examples/te_gemma/media/calibration.svg  |    1 +
 .../te_gemma/media/calibration_1_half.svg     |    1 +
 .../te_gemma/media/calibration_2_half.svg     |    1 +
 .../te_gemma/media/fp8_model_init.svg         |    1 +
 .../te_gemma/media/fp8_model_init_1_half.svg  |    1 +
 .../te_gemma/media/fp8_model_init_2_half.svg  |    1 +
 .../te_gemma/media/generation_animation.gif   |  Bin 0 -> 135280 bytes
 docs/examples/te_gemma/media/graphs.svg       |    1 +
 docs/examples/te_gemma/media/graphs_1.png     |  Bin 0 -> 16100 bytes
 docs/examples/te_gemma/media/graphs_2.png     |  Bin 0 -> 15177 bytes
 docs/examples/te_gemma/media/plot.svg         |    1 +
 docs/examples/te_gemma/media/thd_bshd.svg     |    1 +
 docs/examples/te_gemma/requirements.txt       |    4 +
 docs/examples/te_gemma/run_gemma_2b.py        |   15 +
 docs/examples/te_gemma/run_generation.py      |   22 +
 .../examples/te_gemma/run_generation_llama.py |   10 +
 docs/examples/te_gemma/te_gemma.py            |  808 +++++++++
 .../te_gemma/te_gemma_loading_weights.py      |  160 +++
 docs/examples/te_gemma/te_llama.py            |  759 ++++++++++
 .../te_gemma/te_llama_loading_weights.py      |  224 +++
 docs/examples/te_gemma/test_paged_attn.ipynb  |   33 +
 ...celerate_hf_gemma_finetuning_with_te.ipynb |  314 ++++
 .../tutorial_generation_gemma_with_te.ipynb   | 1277 +++++++++++++++++
 docs/examples/te_gemma/utils.py               |  366 +++++
 .../pytorch/attention/inference.py            |   15 +-
 .../pytorch/attention/multi_head_attention.py |    6 +-
 .../pytorch/csrc/extensions/apply_rope.cpp    |    3 +-
 30 files changed, 4927 insertions(+), 6 deletions(-)
 create mode 100644 docs/examples/te_gemma/check_cuda_graphs.py
 create mode 100755 docs/examples/te_gemma/check_gemm.py
 create mode 100755 docs/examples/te_gemma/check_rope.ipynb
 create mode 100755 docs/examples/te_gemma/media/calibration.svg
 create mode 100755 docs/examples/te_gemma/media/calibration_1_half.svg
 create mode 100755 docs/examples/te_gemma/media/calibration_2_half.svg
 create mode 100755 docs/examples/te_gemma/media/fp8_model_init.svg
 create mode 100755 docs/examples/te_gemma/media/fp8_model_init_1_half.svg
 create mode 100755 docs/examples/te_gemma/media/fp8_model_init_2_half.svg
 create mode 100755 docs/examples/te_gemma/media/generation_animation.gif
 create mode 100755 docs/examples/te_gemma/media/graphs.svg
 create mode 100755 docs/examples/te_gemma/media/graphs_1.png
 create mode 100755 docs/examples/te_gemma/media/graphs_2.png
 create mode 100755 docs/examples/te_gemma/media/plot.svg
 create mode 100755 docs/examples/te_gemma/media/thd_bshd.svg
 create mode 100755 docs/examples/te_gemma/requirements.txt
 create mode 100644 docs/examples/te_gemma/run_gemma_2b.py
 create mode 100755 docs/examples/te_gemma/run_generation.py
 create mode 100755 docs/examples/te_gemma/run_generation_llama.py
 create mode 100755 docs/examples/te_gemma/te_gemma.py
 create mode 100755 docs/examples/te_gemma/te_gemma_loading_weights.py
 create mode 100755 docs/examples/te_gemma/te_llama.py
 create mode 100755 docs/examples/te_gemma/te_llama_loading_weights.py
 create mode 100755 docs/examples/te_gemma/test_paged_attn.ipynb
 create mode 100755 docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb
 create mode 100755 docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb
 create mode 100755 docs/examples/te_gemma/utils.py

diff --git a/docs/examples/te_gemma/check_cuda_graphs.py b/docs/examples/te_gemma/check_cuda_graphs.py
new file mode 100644
index 0000000000..fa198db5ef
--- /dev/null
+++ b/docs/examples/te_gemma/check_cuda_graphs.py
@@ -0,0 +1,60 @@
+import torch
+from transformer_engine.pytorch import Linear, LayerNorm
+
+
+# 1. Define model with static buffers
+class TE_Model(torch.nn.Module):
+    def __init__(self, max_seq_len=4096):
+        super().__init__()
+        self.max_seq_len = max_seq_len
+        self.ln = LayerNorm(1024)
+        self.attn_proj = Linear(1024, 1024)
+
+        # Pre-allocate static buffers
+        self.register_buffer('kv_cache', torch.zeros(max_seq_len, 1024, device='cuda'))
+        self.register_buffer('attn_mask', torch.tril(torch.ones(max_seq_len, max_seq_len, device='cuda')))
+
+    def forward(self, hidden_states, seq_start: int):
+        # Dynamic slicing of static buffers
+        seq_len = hidden_states.size(1)
+        current_mask = self.attn_mask[seq_start:seq_start+seq_len, :seq_len]
+
+        x = self.ln(hidden_states)
+        x = self.attn_proj(x)
+        # Update KV cache (in-place)
+        self.kv_cache[seq_start:seq_start+seq_len].copy_(x)
+        return x
+
+
+# 2. Create graphable callables
+model = TE_Model().cuda()
+static_input = torch.randn(8, 256, 1024, device='cuda')  # (batch, seq, hidden)
+seq_start = torch.tensor(0, device='cuda')
+
+# Wrap with CUDA Graphs
+graph_model = torch.cuda.make_graphed_callables(
+    [model],  # Module list
+    sample_args=[(static_input, seq_start)],  # Must match actual input structure
+    # memory_pool=torch.cuda.graphs.graph_pool_handle(),
+    allow_unused_input=False
+)
+
+
+# 3. Warmup and execution
+def run_inference(x, seq_start):
+    # Inputs must match sample_args' device/type/shape
+    x = x.to('cuda', non_blocking=True).requires_grad_(False)
+    seq_start = seq_start.to('cuda', non_blocking=True)
+
+    with torch.cuda.amp.autocast():
+        return graph_model(x, seq_start)
+
+
+# Warm-up (essential for TE's kernel auto-tuner)
+for _ in range(3):
+    _ = run_inference(static_input, seq_start)
+torch.cuda.synchronize()
+
+
+# 4. Usage with dynamic sequence lengths
+def process_batch(inputs, start_pos):
+    # inputs: (batch, seq) on CPU
+    inputs_gpu = inputs.to('cuda', non_blocking=True)
+
+    # Output shares memory with pre-allocated buffers
+    return run_inference(inputs_gpu, start_pos)
diff --git a/docs/examples/te_gemma/check_gemm.py b/docs/examples/te_gemma/check_gemm.py
new file mode 100755
index 0000000000..dbcc0f53af
--- /dev/null
+++ b/docs/examples/te_gemma/check_gemm.py
@@ -0,0 +1,132 @@
+import functools
+from typing import Optional, Tuple, Union, List
+import torch
+import transformer_engine as te
+import transformer_engine_torch as tex
+from transformer_engine.pytorch.constants import TE_DType
+from transformer_engine.pytorch.utils import assert_dim_for_fp8_exec
+from transformer_engine.pytorch.module.base import get_workspace
+import transformer_engine.pytorch.cpp_extensions as cpp_tex
+
+
+@functools.lru_cache(maxsize=None)
+def _empty_tensor() -> torch.Tensor:
+    """Get tensor with no entries and no data"""
+    return torch.Tensor()
+
+
+def gemm(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    dtype: torch.dtype,
+    workspace: torch.Tensor,
+    gelu: bool = False,
+    gelu_input: Optional[torch.Tensor] = None,
+    grad: bool = False,
+    accumulate: bool = False,
+    layout: str = "TN",
+    out: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    use_bias: bool = False,
+    ub_algo: tex.CommOverlapAlgo = None,
+    ub: Union[tex.CommOverlap, tex.CommOverlapP2P] = None,
+    extra_output_tensor: torch.Tensor = None,
+) -> Tuple[Union[torch.Tensor, None], ...]:
+    """Non FP8 GEMM."""
+
+    assert layout in ("TN", "NN", "NT"), f"GEMM layout {layout} not supported."
+    transa = layout[0] == "T"
+    transb = layout[1] == "T"
+    empty_tensor = _empty_tensor()
+    fp8_index = -1  # dummy index
+
+    if out is None:
+        out = torch.empty(
+            B.shape[1] if transb else B.shape[0],
+            A.shape[0] if transa else A.shape[1],
+            dtype=dtype,
+            device="cuda",
+        )
+    else:
+        if not out.is_contiguous():
+            raise ValueError("Output tensor is not contiguous.")
+
+    if gelu and not grad:
+        gelu_input = torch.empty_like(out, dtype=dtype)
+    elif not gelu:
+        gelu_input = empty_tensor
+
+    if grad and use_bias:
+        grad_bias = torch.empty(B.shape[1], dtype=out.dtype, device="cuda")
+    else:
+        grad_bias = empty_tensor
+
+    bias = bias if use_bias else empty_tensor
+
+    assert (
+        A.dtype == dtype and B.dtype == dtype
+    ), f"Expected dtype={dtype}, but found A.dtype={A.dtype} and B.dtype={B.dtype}"
+    input_dtype = TE_DType[dtype]
+    output_dtype = TE_DType[out.dtype]
+    if use_bias:
+        bias_dtype = TE_DType[grad_bias.dtype] if grad else TE_DType[bias.dtype]
+    else:
+        bias_dtype = output_dtype
+
+    args = (
+        A,
+        empty_tensor,
+        fp8_index,
+        input_dtype,
+        transa,
+        B,
+        empty_tensor,
+        fp8_index,
+        input_dtype,
+        transb,
+        out,
+        empty_tensor,  # out_scale
+        output_dtype,
+        empty_tensor,  # out_amax
+        grad_bias if grad else bias,
+        bias_dtype,
+        gelu_input,
+        grad,
+        workspace,
+        workspace.shape[0],
+        accumulate,
+        False,  # use_split_accumulator
+    )
+    fn = torch.ops.tex_ts.te_gemm_ts
+    if ub_algo is not None:
+        assert ub is not None, "ub object is None!"
+    _ = fn(*args)
+
+    import pdb; pdb.set_trace()
+    return out, grad_bias, gelu_input
+
+
+if __name__ == "__main__":
+    fc2_weight = torch.load("fc2_weight.pth").cuda()
+
+    base_repo = "/perfhome/mnt/wkstn/work/repos/te_gemma_gen_support/TransformerEngine/docs/examples/te_gemma/"
+    base_repo = ""
+    gelu_out = torch.load(base_repo + "gelu_out.pth").cuda()
+
+    activation_dtype = torch.bfloat16
+    fc2_bias = _empty_tensor()
+    use_fc2_bias = False
+
+    dim_size = list(gelu_out.size())
+    dim_size[1] = fc2_weight.size(0)
+    fc2_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device)
+
+    _ = cpp_tex.gemm(
+        fc2_weight,
+        gelu_out,
+        activation_dtype,
+        get_workspace(),
+        bias=fc2_bias,
+        use_bias=use_fc2_bias,
+        out=fc2_out,
+        ub_algo=None,
+        ub=None,
+        extra_output_tensor=None,
+    )
\ No newline at end of file
diff --git a/docs/examples/te_gemma/check_rope.ipynb b/docs/examples/te_gemma/check_rope.ipynb
new file mode 100755
index 0000000000..26d5c9058f
--- /dev/null
+++ b/docs/examples/te_gemma/check_rope.ipynb
@@ -0,0 +1,716 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "72f61b51-b6fc-4463-9783-d42a25ca3a2f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "before tex import\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "import math\n",
+    "print(\"before tex import\")\n",
+    "import transformer_engine as te\n",
+    "import transformer_engine_torch as tex"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1f81be75-bf64-43b2-852a-7c482a1c3418",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformer_engine.pytorch.attention import apply_rotary_pos_emb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8853f973-d834-41a9-929d-8687b947134f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compare_rope_outputs(t, freqs_s11d, freqs_sb1d):\n",
+    "    output1 = tex.fused_rope_forward(t, freqs_s11d, torch.Tensor(), False)\n",
+    "    output2 = tex.fused_rope_forward(t, freqs_sb1d, torch.Tensor(), False)\n",
+    "    print(output1, output2, sep=\"\\n\")\n",
+    "    assert torch.allclose(output1, output2)\n",
+    "    return output1, output2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6b7bada1-6748-46f1-93a4-c2ac1a617063",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "torch.manual_seed(0)\n",
+    "b = 2\n",
+    "s = 3\n",
+    "h = 2\n",
+    "d = 4"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "54a8f6d6-28f8-4a9a-8ba0-0fdefff138e7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([3, 1, 1, 4]) torch.Size([3, 2, 1, 4])\n"
+     ]
+    }
+   ],
+   "source": [
+    "freqs_s11d = torch.ones(s, 1, 1, d).cuda() * math.pi/4\n",
+    "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n",
+    "t = torch.ones(s, b, h, d).cuda()\n",
+    "\n",
+    "print(freqs_s11d.shape, freqs_sb1d.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5070307a-3104-401b-b84c-00f3bbf02ccc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[[[0.7854, 0.7854, 0.7854, 0.7854]]],\n",
+       "\n",
+       "\n",
+       "        [[[0.7854, 0.7854, 0.7854, 0.7854]]],\n",
+       "\n",
+       "\n",
+       "        [[[0.7854, 0.7854, 0.7854, 0.7854]]]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "freqs_s11d"
+   ]
+  },
"81e52785-e6ad-4180-9567-564af692375c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 4, 4, 1)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freqs_s11d.stride()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0da9bc09-7e1e-4056-85eb-64b6122c7440", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4, 0\n", + "4, 4, 4, 1, \n", + "nvt_fused_rope_fwd: 4, 0fused_rope_fwd: 4, 0fused_rope_fwd_launcher: 4, 0thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 
0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 
0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, 
freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n" + ] + } + ], + "source": [ + "output = tex.fused_rope_forward(t, freqs_s11d, torch.Tensor(), False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1b78017d-09b3-4b5f-93a8-75f6ba6f131c", + "metadata": {}, + "outputs": [], + "source": [ + "output_unfused=apply_rotary_pos_emb(\n", + " t,\n", + " freqs_s11d,\n", + " tensor_format=\"sbhd\",\n", + " fused=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6f5d9350-deb1-48ef-a0a2-e18e01ed336f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_unfused" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b01e29b8-dfdf-41ac-81a5-d8edf6a8c168", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4, 0\n", + "4, 4, 4, 
1, \n", + "nvt_fused_rope_fwd: 4, 0fused_rope_fwd: 4, 0fused_rope_fwd_launcher: 4, 08, 4\n", + "8, 4, 4, 1, \n", + "nvt_fused_rope_fwd: 8, 4fused_rope_fwd: 8, 4fused_rope_fwd_launcher: 8, 4thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, 
s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 
0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, 
freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + 
"thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 
0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, 
freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')\n", + "tensor([[[[-5.9605e-08, 
-5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')\n" + ] + } + ], + "source": [ + "output1, output2 = compare_rope_outputs(t, freqs_s11d, freqs_sb1d)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b168b178-1f63-4ccc-b084-2ac2c1ec016b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([6, 1, 1, 4]) torch.Size([6, 2, 1, 4])\n" + ] + } + ], + "source": [ + "freqs_s11d = torch.randn(s, 1, 1, d).cuda()\n", + "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n", + "\n", + "print(freqs_s11d.shape, freqs_sb1d.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "33ec2e07-6e54-49f7-92f7-2f217a766456", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.0000e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.0000e+00]]]],\n", + " device='cuda:0')\n", + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [ 7.0711e-01, 
7.0711e-01, 7.0711e-01, 7.0711e-01]],\n", + "\n", + " [[ 7.0711e-01, 7.0711e-01, 7.0711e-01, 7.0711e-01],\n", + " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]]]],\n", + " device='cuda:0')\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m output1, output2 \u001b[38;5;241m=\u001b[39m \u001b[43mcompare_rope_outputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreqs_s11d\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreqs_sb1d\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[8], line 5\u001b[0m, in \u001b[0;36mcompare_rope_outputs\u001b[0;34m(t, freqs_s11d, freqs_sb1d)\u001b[0m\n\u001b[1;32m 3\u001b[0m output2 \u001b[38;5;241m=\u001b[39m tex\u001b[38;5;241m.\u001b[39mfused_rope_forward(t, freqs_sb1d, torch\u001b[38;5;241m.\u001b[39mTensor(), \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(output1, output2, sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mallclose(output1, output2)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output1, output2\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "output1, output2 = compare_rope_outputs(t, freqs_s11d, freqs_sb1d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b58b818-7b31-4ecd-80bd-b5ba049b3c2e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before tex import\n" + ] + } + ], + "source": [ + "freqs_s11d = torch.randn(s, 1, 1, d).cuda()\n", + "print(freqs_s11d)\n", + "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n", + "print(freqs_sb1d)\n", + "assert torch.all(torch.eq(freqs_sb1d[:, 0, ...], freqs_sb1d[:, 1, ...]))\n", + "\n", + "comp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c04940b8-3056-466b-90f6-07a02ac47ace", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/media/calibration.svg b/docs/examples/te_gemma/media/calibration.svg new file mode 100755 index 0000000000..b1e1b5ae4b --- /dev/null +++ b/docs/examples/te_gemma/media/calibration.svg @@ -0,0 +1 @@ +FP8 with initial scaling factorsHighprecisionweightInitialFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMMWeight calibrationHighprecisionweightFP8 scalingfactorsHighprecisioninputHighprecisionGEMMFP8 with calibrated scaling factorsHighprecisionweightCalibratedFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMM \ No newline at end of file diff --git a/docs/examples/te_gemma/media/calibration_1_half.svg 
b/docs/examples/te_gemma/media/calibration_1_half.svg
new file mode 100755
index 0000000000..af2641387f
--- /dev/null
+++ b/docs/examples/te_gemma/media/calibration_1_half.svg
@@ -0,0 +1 @@
+[SVG diagram text: High precision weight, Initial FP8 scaling factors, FP8 Weight, FP8 Input, High precision input, FP8 GEMM, High precision weight, FP8 scaling factors, High precision input, High precision GEMM, FP8 with initial scaling factors, Weight calibration]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/calibration_2_half.svg b/docs/examples/te_gemma/media/calibration_2_half.svg
new file mode 100755
index 0000000000..2d56f7d434
--- /dev/null
+++ b/docs/examples/te_gemma/media/calibration_2_half.svg
@@ -0,0 +1 @@
+[SVG diagram text: Weight calibration, High precision weight, FP8 scaling factors, High precision input, High precision GEMM, FP8 with calibrated scaling factors, High precision weight, Calibrated FP8 scaling factors, FP8 Weight, FP8 Input, High precision input, FP8 GEMM]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/fp8_model_init.svg b/docs/examples/te_gemma/media/fp8_model_init.svg
new file mode 100755
index 0000000000..c7fce2120d
--- /dev/null
+++ b/docs/examples/te_gemma/media/fp8_model_init.svg
@@ -0,0 +1 @@
+[SVG diagram text: FP32/BF16, FP8, FP8 with fp8_model_init(), FP8 weight, FP8 GEMM, High precision weight, High precision input, High precision GEMM, High precision weight, FP8 Weight, FP8 input, FP8 GEMM, FP8 input]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/fp8_model_init_1_half.svg b/docs/examples/te_gemma/media/fp8_model_init_1_half.svg
new file mode 100755
index 0000000000..3b217a3eb2
--- /dev/null
+++ b/docs/examples/te_gemma/media/fp8_model_init_1_half.svg
@@ -0,0 +1 @@
+[SVG diagram text: FP32/BF16, High precision weight, High precision input, High precision GEMM, High precision weight, FP8 Weight, FP8 input, FP8 GEMM, FP8]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/fp8_model_init_2_half.svg b/docs/examples/te_gemma/media/fp8_model_init_2_half.svg
new file mode 100755
index 0000000000..46587664fe
--- /dev/null
+++ b/docs/examples/te_gemma/media/fp8_model_init_2_half.svg
@@ -0,0 +1 @@
+[SVG diagram text: FP8, FP8 with fp8_model_init(), FP8 weight, FP8 GEMM, High precision weight, FP8 Weight, FP8 input, FP8 GEMM, FP8 input]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/generation_animation.gif b/docs/examples/te_gemma/media/generation_animation.gif
new file mode 100755
index 0000000000000000000000000000000000000000..25150cb9b64162084b017442a3905c57127c6713
GIT binary patch
literal 135280
[binary GIF data omitted]

zyl9XJE(>v`FqF0-A7q@JQY~|>kVM%Sn2t}kR~S_VGv3pqrVs0u7r}~UY~5zynnUB* zy470Y+gW2d&Q9BB*VU!V+Pfotmbh`<&kKUmo#2T|t1|A26XiRu&C?24jz`Ub^bXJo z`nfr85#Vzo72E4(vV?=cKWzi ze38s4)|cV)4FIz%88|0oGJN4A)jS#0UUY`H1KY*$lVK|ZG8g0$!tzNvBVz8W&wFeHpinsn2Ex z3`pH-G8w;GX`aoZ7u_1#8FxsW;Ait22JX#EjJwe3vqjRAMfdhA#yx-P^JTChq>Gt^ z>07ww`6t>XNS`dzejJ?S{4=K^bjXnD05N^OD!xPt9rI=So=ttRu4L#jk;(Lfs95u2 zQ+vr{x}E9p4V>g++se>$Zi(rrar$Bhx3+48l?e;lV(JB;z0wQjF{B7}PKf`f&_q5>m`t>X_^> zOzv?^eo`zU5_K$57?$KXmMkgu19fZ#Vi>l{IQA1#oagE|nlKy^-Eka4Qe0DYTniYk z%{VTJ11X-1I-WZW&vP8lmlU5UP#r%Mh95bOA4>{O0IP%HFtF)uAZh#SM8xfXNEv{U zG3!1Cs2MZ%!@F}SSL>PQ%Q+NxBxc&G0P_N{ zHUNDCusHzv0WbBVf^in;Ks}SKPe;d z|4l^vzln&yr(^yPqzucyN*QaE9epgnlw(5YzYSo4e@Yn*+xtU2+)ww1??$#f91$Dy zav7CEmUaK;WKi?o9aMadaqwQXLA3*LG8Vp1s_@_Y@qt%2`p1+C#qw|0IYw9Pb|Mtq z^jW`L=eW&1c*rY?p=vv_KM%}53NJp}dMv!8p1^y&Y@BHaS>R&M`sFqUvf_j#(><4{ zS>g6Mp3wMY)sYb6cp|}tPh!=}H@ah^sIcO6vzU4B$H%H$Z}wLJALDp1bIks1Cn_Q4 zh_nUI*KgMai~oG5d+6fFw`O1^>fpipsMqeh(zND5dDV8YooT`>>96xxd>2Pip@vP< zpCxyszK_LLN|UXQwK^XTYw~xjVGkQVyf|E`Z2#65nE6AdLFPtYrQfz>+F1d6$5}Xu z=h+G~_!GMkP|SIHYeuhp~PJTUm#sDIGPuq=6LM^mvbGC|gMEVLqKUSyw!k zaun8kUN}ZK4ism!I`h)U1Me%|n7qX6puznX0=Z>E6b*TZ*R~fR^7do)9m~h`#JLN4eWLyv*3CCX7H9qX^UW)W0w(^xXAnW8wDPF@Pn4dyT}KSmu1SwI_8QkarElMT_31s{m`6YY;zM4(U?EzC&&LJP_jzqz zOT1MT_qEB+YrT61#bfTucQXiC*J%4JM}yQRGXsZLAMqwX?f)p`yN`WKTHIsYJC!1j z@x!_bZ}>CSlGphZHk?0XlqX)_e35n|pY55;=Cd5!io$Yi4t0LE$sRk3*gIx<51a;H zl!5UX0e*E$;+4XFO5@QAnZS6$=7W>v*DUX$VBJNFl=exkx<_+7lt2r z**v|m`LxjFC4#m5w!w!+&62|(p1va`3~S1o>2%5E`ad?!sJW#gMcj<77#CGAwlrh{ z$g@R*xwW4cYZWEGtjzS*vb;5?QJyMNlhewhm1M1%5%VMS4bi6QlBmu=NPM*uudOv6 za%Rz41d>6t^VVEQW@X>L`iNGIUPwtqD~_<0C#}-sbtP9QX3J zNBk=}+y(*VJQ&9#l|Dy9OBaiGnx}<#q1ZzIpwmU!HFtGr7fY>_&yP1eHLd)9@z_74cq;7(@~ldW8@Ewa>hZ zc^T&QiVcYvke4}#|5mt)lh-(6&m@?J_F5I~LXTQd31QL5`AHu^$NY$Ev+zbNWCF#9 zP1Ih0OUzoAe-QmH#{N6`P2L+D4|Gon5;gAT2}QRnxin3{iR+3K9&bKFo=z62)s<+D zWNvCSpMI#4xj$yk2tvF|wUBHU9(69aN#!-}?!a{``xgAEfHQkWb%L(?OHYUA^zI9H?f%a`K-*~PuWU1W>#7wfUm5`)0$ zy3O5%4HxKCab9ZyLUd{E938Ox4-guv9QR8Lc-S-gimMqE9ROHG&*b?45njG znE2Wy50+k0xAcalx^(T%H&%>ruP_rIcF8VopYcd8zC`}WcUW+ZT=E*1^gU@GlHPmG z>}Ag#JM&KV zj651@hd%UQ@>n~Qd7HxT^P~I;-!}J)vuwE3XE3ujdMsmY%vWy?HbBCAAp80vXsvC( zX4mJKZPD#O!FSZe*M!>lq|j$c0jg!;>Hf$&J;?*pk{7#32TTKfW8tT00X<{aD5Uly zL-AY})1oZWKJC-l6!lsU@{L~cXHMa{v2O5QGhlqu2lv?L~e6 zHE)i7|D8TZwZgy;oq>K0-uV4~(qcjEtUe@=phQJ6UgcvCRfzvF+`o?6t2;ChXu-U5 z4wg&t(Gd#}DGF9kaW~=!nZF;he&qRhJ@{NR@N&&>JSot=pU06V&}GKEb1mp<%>_8M z8GIjlp$Rd=_p#;(gY>%vTZZ*J8!{&wKhq3P#y3r|G^VK!zYxZ}L1mlM9}s`c-NR;D z(P!IBW#*M9`ylbp7|hJ8P@`scbMppp2Ls1XO3aNYVM(7>0^%JD-YUC ziP@To37)ceT@$srZ-o(xeW>NA0ER1S#=Qg}+3YA1;_ z3NRr1Vvhwl21(l5NlzgTMresVRAR1)vyOPIWtch!+}R>L*>)od3U@YDO3pTMe%9yA zz2?MG2>-|qk5hyf2ZXs5g*j;%5h{j0)C!cxPieZDBBPk%iiZdk_kNZVYV6aWQuZmu z>Lf6@*pGcZED0PA4@Z>bAXaY&Y91lNXoKw(Q|nDq{l9oy_6NVWFbY(ZblFHv#YE=+ zfU2duW($MF7C~~0kp-Q|`Y*^*Z9inOTNl{R*DB2w;q|sStrhLrUhMzD%HvVL(L2uc z%z!iuHY6Gy>~|czFyPt)0cYgg%y^-YF{I>?g-+c*;o1>*--Bkj9;YNKW&S{O9ihFZ z8vTzNLY}%H_m9%A8j+LYfzPH9R4JL*gK4-Po|lbT`RJ@-L1>0!DT2AqVtF?S0u!u|V@hF4C3(r6dC9@4V(uwogQ@%_SsGlNTC*>7=z{!@^S=Ga ze@R!6AXq>qk;T)LIfBl#9xUi9&eo-Kk4(vDrYl^7hrKRIQM4}1|5U)VnWwLu&Bs-= z*`!pIXHs-uCzqZpDAFS@db8+jc%eC0p^Hwzf@>~|L{5}WPD*OAjD(|5Xz*QvLXp`p z%ZRK^tb+U!jznPa#5EiUzNDs<(rdW3%$717#dqN5)^Kq&DMsF>jq24h?beDK6NzeB zFB|iSQh_HbYnNp>myORpY85k^oGl|;EnlK5|HxIL_Pl&bM{H)6bL=<@Vs4HuGT+s) za4s-^Enj(mHR@n9+Ur*34-bnguIS7a3v3ljTuIA)epE$GC+ul>?6t_`v*Rkl0T_WN z4+-HLvY{#*1nm0I8%n^*ps&6uS$$i#8W1ubaUiR?nyY!{s`<{U%c|p9O>G2igzoCr zu;j)IpV@St#tR@h?jaI{pb7Kxwdg6~4UckJ8;*Olk&iZN6~J|)A&E-drI9LUY^%LV zHZIB8xpiTW$si(yZo1tNJCF+Bj&*)OS5vV}S;*Af5<;umC68Wfs 
zhU7@Llp#c%QeeDIo{MBrHX%nYH-bj7>1jk$hizSx@B5~L=HM#NCQ+@z+Aa3_t>UQu z#&D0U3}jZ=Tyst{>TP8Dn}L!9kAg1ll4RwQ{vr1E(#Em5oOG4OacIj2UAE~nl#fnU zQfd?Yv=vI;7N1&JLDx#u-})k=%_FC6bf~S9u+d7Q#LkMlUHg9f@K)Ost+tw>cDGOM zY07zi5{=Z#9nK~lzFeNQh>mJPgKK&ngElE1ba@EwHY8!|F<~h;eiLOGE3ubv4Wikl zq@zW$lgR6B1{&o@&_Xlc#;n)!X0w&2g_U(UjeWa`Q|j$wCwLdHR|y{cf3WvfQEe~k z-fjYI2@u?ZYjG%2+-WIR3KS`>Z7FWWEx3DecXxMpcehg9T?=12JJ(!u?zP7_=X_^h z?#tZdA{iqYnT}scjeCr{;U>4kBu3C}^R$?^ML>qTueTXGlE0 ztK$ypQcmv*4{Q+BNIPn1)k|!A{Z`{mVH@pYM@?Y&4PVn;L-$+JZUNdJ4Ydw!lScdf z8fLHy+HUP$L!OmX4_RYIx_csZ)olS;c1~ELb`p1A6u2*KwnEIhCpd^|p0{{0zi=}^ zaVb9_m8gG%w`fbsS!tjz$%HBe=6X@;&mJuN5#L`4)=w|&uVz#(a_%Z=q>_B|xkF>{ z<0Vgb;$WxqU~l7K|NWrz^x$aWptX}}b8Vkp+}-@Gn=YF#j4UYEbLE-K8- zT$eFO(ilP07!l^u7|G8uGU9OxfpIF$acY-wTFRtx`lfNlrEz9rLaV#+v-Su!P1E@| z#3*tTA6P$)kplo>6H2_AIKzy@=l}o~`lPaD=m0qoFgi&Fos=vJ?PCSXpaAdyQyXsL4-jl0Bn}fav3;covAMZ3rcHq zxoJR>uo=9qxHV4Y02Kiz^*}@<(?l3HcLA*3xG@mEGH0`nkxFMKtT3R%XDm+U!Mbv;xw6DWaGkVr%e0&_w({_EWh)84 z+huV_29AJb)d*uH7-Kdkcoo3%gwz7)IkftPb;;jy0dPE7EU<=THUlVK)q*6ip@u-* zVV@xLf-B0*^W{|P>@uE+*)&$`52bfMSeNmLmZwPgmp7~|HyqODlw2l1YT(ts0vDzo?9j5JX-VYex9}b}% zB{}e6JD7Y&JMDVl5pgh&O1nt5eBgSzyZW$yDR5Yzb+~!ByPbSk*?hR~x_9_+SWj|9 zdb+%Ku65MvdUP$gf7^U?!g%;VatI(jc0N6Lf=2U<+U?jZ;uvX!8oY9Bbb5>)N{uOW z^7;Jcc)6s4z(E2qqlr_8dET_LCPc$`b-oeNExoojqQ7eKt2VZQiT zz{Z7Ub`)}q-KCt~WwqbsPBBb1aQS`nav$|disEYg&6T9) z$ho%1)mZkG{>YUA+U2Vkm*_%EteU4@Dd#>yC&|RsD&McPo?Sl(&R9iWz?EL`#hr+y zPCZRNjU&BG7`aI*2~E+y417EfmXN)*+lvYyz4aElEi`|UYI|#(a;wyGTOs=-{PLo7 z)j7lQI^L&B}4ApckT z;S%qe<7Bq52%`S)h>U*%8R``iXaA!_#@~PpoBt#-(lP;m5*Zy{SaiyN5E=4t+_vY} z7#}4<34Re7zkm$A*VSKdEI<7QGV)3?KuLTsAY-m#UFLW1;iYO1+#f*3q@i4Y0mVSD2npaPy&zLWn$myr|LMNm_L9F-ZU|{=MqH(wkC5GhC;eb z^OmMdFd&24`OeC0XgXzMIEuI^R zi{nLW?~035(;q~JZq=1hva9RQyBn_Z7uQV>e*_=yu0t8_!7w7@8Sx}sTQ{6DjL2YW zl=BCryhraq6}Ug|^v%MP>HEk-47(L_^f1E!v;MfQ*4CL5tHKRPNSvi5aWGh(WfYM$i+a z+9#==J31g2ve=+-Q;vEdgn3NME{smND<5*=63E&rx0#@eOWr);wnHKX-A z_wcLsV1sk&Px(zuZ|-w=E{>Y0c~_OVe$nsRgYqnA#!uWumh3ELakC*9j`4NE<>jL7 z(vsfdE4xbQf75!H)kcAAjLRc0YsTs_e3z-z)f3*ESKw;ZrX} z`e>8YAJL)o_?xkxlH)exNsGrfROo8~YiQWR_}d94f0Q1I@wZY{4afVC!;}bi<3f_- zVK7ECmys&S!ph@A#RG3vK^IAaNqAfIh9@$w&eeW(mdorx?M5;oM(|EEjKw%BRX5k8 zxFv)=l)NQLj+E{7nV)oMqY{ttu)0U*cdCQd~I})2CSmZ(aUi12Uuv^&mdX#o^%a1rW zlG}^K(B#|QiUh{x+x;pb*h10t*j#i!Sy*^A0M0txYH9rm#D_n9)~6;~1gF zmFv={v*Is}imlxj{lU~fRU~NAN5lNV6cjjfy+mx|;o*wpRL@oW(BF?o#7&Y>bIkRD ztZ2p~(_WL&imCPkBF4Yvb&}F6&GlcEjYpNeCT0AjI&d^L9$i;S!c1*5H?Vaw9@F-c zgvDERa2aDFwy%44e36ma0s@Jk3P>92F5~)tpSDTpT; zMB)yZfOl$cII?Ua>Et7XpJ_vN#Nq(a=jI%X{9tax^3d}A12*k8fMC>}q|)+@A}vOg zx16sab~;AX_G>xYT9IQjBXyNY>g zN(EXt1*%iK%6N72N+qs1g}OL!E2bCI<&1@&^&U*Yh1%QvhP#Z zI>SoFYuZsda-3Vxx@yL2K8ZSV-oaGYzFWp?xtTh0y;;zHe86i3V&NV`5jAw+Ng-{B z%Eun~un|}Y6Vi?mee6Z2p^K~y>A;^p_Tj-?)J1cHbduwq_(^K$VWmL2=rEN}0#q0E z@LC|lcc zh5_FS{)of$S@O-IfzTuVs5=_&c`Bmj7g#iU%wPFD9Uo)qi?|TMczE=ACY`3Cv^K#6 zdffDRHqVlwtQ)~(8tz4|B$}p?LJGlDp7KS$>XOli7BqtCvgnIKLrr7#6@r<%X^e|v zhb3d}M}pZl+{;pbO%r`IQo^}D<;(K8B@+W!>vs$-`m!=l)6`g-aA9uxvbqj!$<*A9 zaB&Uysc7d1Utk96-^q-xIjH|6 zGr+2UBQsF{Kr{X#Gf@6SGk%j9D1V_DzsL;KzoQu>C*Z%M8Q_?|p&6*Z*o=Rp8Q|Y+ z#&0wO6I{!5M$D87P0k8Nb;K@W0@UUu*{W z-*Cpw3IYty(5~|M7a&3YPe1}jN5Ciu7##uAkiw7#7}x+~9$@eT410j_4>0lphCTdN zQ^05k7#9JfBVY&w41|DD6L9FbFo^{mCO*t#0rOJ81QjsvD9k|$laazK7cfo5ZwKjr zp&I@NU&5>vVgGBb_+LQce*uaAAAkhTKLH85{gYpS1lE7Fby#|^|7%<4S6T<_pKYB# z(mGiG#bNqOT!Hxyhv^@X1m-^+rhh0*G5+o_{X=1j`45Nb9|}{9KifM0PU~Ra{w5{< zmexU^F}pjc{aacG{l)!p=ik#h7{A&&|48eg|LHLOOJR!k+hO{b!W4bx4~OZ$6{Z-e zfL{*NzbQ=7|4UkD5aXX|9avch{hw(amLQCOq;-CkbI%q?! 
zkaS3;z_&j4)_$D*?O9_^>?uv2tjT;u%F;cJa82ikyH#H5v!Zc;-{`P6wPM;TOjAKh z@Z8+UI^j9?r}vx&{$oMR{_!Q}+2U!H(z+pvQvI*DB_qF$N4=AbV;0MkB-E!@2p|8J zS>L!L&6lqboL`y4h(jwE^%|N+fO?TDA(Q_;rhhV<_aaXYH8@`}M`|ix?IK@gC|?<+ zrhlr~@uENnr9h=e3U#_X{-W@UOo7@=|8#Xd%0-byaDm3w@2(EYA7Pz8yE-VSg*ph* zv+V?zrG7GndU$jUz2A7JV}oo)^l6MIdb54YW0P_}y-1krGqV*)y4W}1A)Jp6!4=4| zq1iW-Pf&|3p5`yIIWX?SQtus(uCBV+Fttor=s%q1YZ}@y@4(Vnp+L3c>)P0{EQwr9 zVUFP)!r8UX!O|Rh$Hh0*#B5hypm|_HT+=Xn!)#wfxqKqZRloU(*>OTW>9ib=_t2%p z`a3a$_U33!bAR0n*YJ#$p9DEA56@Ad@QjH-EC}~ARKZdYyusCHeAjL0uuup#&D%93 z_1Z6p&Bw-Ug=^CmwH*aF#4LUal;1H@U=mYi6qv*`9;br)C9~^^fga=f@F;O@cS0R& z2z$mlv37QkU@!wr7v4LR>edxXf|1EJv?ETa147h+8nQNhp|ODh6=sK- z-q0^Xf%p`8AytLa#sI)9FRSd>tudc~nv&`)Rsm~@aQL8y^KV;cP`T~k*q0cd zT@wKImC2eMF3i+lLEZ1Br{E8nfY6v*4bCs#hoFFKpB+nhp#fqnp;5ugcU54nehi%Yn$nO(le$)-Q0% zfMI|(4m819$D^tu9LZBnpew!gPQEaw95x0KQrTuK<$N^&yFAm?L6RO<2~g^p)B0c7kk$$%_@C9 zEkBn~nQqmuJWuiHZt!fVu%Th!-JXAhhNRn2XF`L^?cHafS9pwIPFnZ-9>xqjcidRJ zAvl|xYImRUSJ5fe^+Tb)33(aCytAFmzv8 zzj*EKEb0mvbQ&x zDS@-AwzYMl^(+4F3x)WZ)iQX206zA9!MO|p60e+9Xy_pRTOSCZHGT{U+xT?X?kOq+ z=Ue_6wfJB%hnMyNS*(u1Jpq}lo&^YjX{?Ss5`oF#jun)Cfyv~~HNAley56m`_yb%) zDJRZ?UxMP~gZg{%duM|Z+k$9DWAR~0owziw=`Z+3AfLG}_z0~oOZNDyF)nKm{Erad z4G8|ZfiKlg?MF4gkb`Xq2of5`3O$wx4P*_y-u{?xAA08pxxEezNb{wF%Y&eQ33z&g z_arXNH{2c67v{X?jvfc0RPn%)grwPr`>uJA5*R|tXT!bSJ;@{?0^AXv?!nYMcNn+@ zZ#HFN@9^=WAW@Qrwu+K?3Jy^gCw}p{Q6}M0R9bVmagb<>G5?P{xcCmyrt$$seYkns z#tpO425SLmbTJwy;iR^4xYe;ShG~J${j?G2}x#SmqA) zV|~~}s!7uBF|%>l6m?0G@)67a*m3yD!XmNcKW?zr9g+oflXv^DcIJ|K+mestuy}b= zMAssd3{!Z>zmeY}V!xS75ls6An8#Y^4S{#W>UKzd35-TE!qS*aM;!Gv$5K;Ednsid2`UiQ*DtjI^uG0 z-E*Ah(fcTGa;RCBQ`)6@NO(O5lR_UB@xC57#xFY)A|0+UIj=h36$^N`0f z5(&`X-{t|<^3r$F($w;Q3g_q6qg{~IuA^YN$Uc z2}-|PpinxMPqIKY^U7ySbEudbQ0fRO=EK|{#Ve+c3%K{dI1Uwj3@A&t6|<|kqJb#v zwZ-IOged3nmE$a)NCTCFrFnA5C?~vC9Xh4Td|;)7s#a*#OATaj-c_ZxI=G0py0aBX z5C)vXtNyGGCbBSo#Zq9(2fljtwVJo4U9N_lrNSN=j1RSHRG6=6Kdy<)sqj!oCRjB_ zTJ_zv17HA>d>4@X)m`&x%@B@l*6`};S?Z{2Z86ZT!gA{poXo`x0QtT4dJOf&$JLnw z_L&I)rh@vi*7_nRb96u*Z+Re6rFsJ*Aft{CxxvXCaTHK{*HEwCfF4%geTUSB+|bp~ zn2QdmHclNiMjjuqFH>)-Tm=HcfFq~{P5G_B)qe?0!7zjAUjkF4WsFopR`k^dzyh51Xte+o?h@RuU}5}5w>mm>cmF#YW>MfxQ${p~MBh6zmn zN z;ZavtH$FZd78dsE)vK>xzjk$Xv9hwRuC6L5C?qE*fBg87oSeL{un>z?$h-s|cFSX1 z2S8?KfnD^#s?P@5&tTzaOirf%Y(M`Gu2}!GZ-tNq1$cORdHeYK`3D3B1&4%&g-1kw zgBfOH5KC7V z&Q<)s>PBNRFl!CgmaX@MQA#Jjtt;Oc_#glbZM5!JzQ7HH&rc*QFuTuAzh%zcp)QR&+|NIy z)e*gM=9mu-9az<#_cJz5|F(-y{6&`N@n9)$Ld$Y+OKXUxiXd(}wvh+td!Y zZ7Rhs`!sB?ez!)};U4!T7fny+r@h5X^M4e6=8qQQ=L}iqN2`z3XO^tr21p9LE7#|L zjbykHb^lrwUQIV-? 
zps0gGqnhOWp!h{F`p091e3A|A;H4+AJ3L0rX1D@aPI;Op& zFC-rekviY~(4d?K5;_B3Fo>H)_bVl`4WSMuL7R z_n?Z+sorXKGHq8vGFOrGzZ3eU;IX{g|P*@WpVVUuV{G;&P?O$Kzvb1e4yBSUAcE z;5aW@JK%{7GMsxL@9`eP$jDSbycY-N`-;(irGAbg$4$PM)zf)R`vac{BY+({RdgVD zoN{i7X9hwDBIG9|<28*I_dmiV={=&prb~P2yawwiFXw=s$?#>;55Ot-YBm=LS zf|QL+GIB(~umNBuengS+_;TxEcgqVTv>pMSK)qahR&ByR@1;@X;x;#OeMRPebeKNR zWZa_VF-5BFc6enFpG>x_@c!=eOM(7_1LwXxrHxL6GP zdlg#ob!GM93q~8F-?WMWM$kPyiOIK>$)A2EbK90*9ZB}277GirBwi$QoW>F*rlwJ) zdrd&;tU|sIB=d76d=2o8c?D`>a7A7uJG82S(i+M70Jh143qff~hDQMS!Z09`1c-`$ z%cWzU4EH&#x?@D)P`5Xp#0COf+#m`?FOc6SzoP%V78KMUf$GD1%=S{y>JQYJs^f3B!wCl8Jgd1NV^S*Uk2!k+>|znc z1O&W=9Wg*to*ajG8ajsnp>>ihAtrG!>FPi`BB?a*sa&ITNbBoMIw_Jf{!NWf*}ZhH zxgKGqoo}_eMAEixJGvz?UacEEV_beRuA>HMuQIyptHp(I^!$^@lZP>F$4GY$%Tj~UrknLS;`NeF>+Z>J}=uS*Rrhy32mM_|yX35unP>xk|}EDO%l68KnV zB2DE0sF0v1it<@c#X4~tVg}Puk)n})i~z5@IAREJvQ(!sifjp5;nRlm=U?yfBHJgS znN|gzKFQVCst0ioON11#Za-J5qu*RXZ?uWSF}3~2J#pWKlLvfwMgn_WcpDL-rHI-h$l%xCU-bA2 zTV^hG(h&5Zs0fJPNY^uXmY2lI;mPydi=sq9gOoTZ32xhXK!ri|{B2+|dxa_aXGf6; zx~qb(C?)n!9cPBC^K1Zqcy7nm?VI^kl|B|nj!6Tmk@a7KAQRu8E;-FzJpE9_Ydt19 z)1$Q_m%yO5Cw0MtN30qGJfpvZPx3L(>w8xp={dwx#!GZ?3M)zdxjp?a zLi`%OPOm3Vwi6%X+2eo(Pv&ETE32ZOQWnlY2;T}E1-~E`0WAR8=zB0fo45r2(56JQ z^dzCCclgxU=7$fd@OCkazPMPgx`os)dLBC<$LK?H>}+;Xicnt+HsT{V4AedHorY>$ zO<@V)_2o@y#gyKCr?|6kc6fFgjBp?;Oh{3u9l{iYC9}@eF*C*zeUu2y^TYCa5zPw% zfMX}f*|pV10_3O^A`2C%>+SmtLUyBNV3IGlTM3DQ^K6A7{X9MVFHlZ)voi1!oOJk@ zdi;69LE|>phNGt+4yPv?nejXQ>Mrt2_M#5YEi|m#d2U;Fyp#Oh0NoebjGqdoDrH!t zO4NEsHb!#s1|-@^zQ4LZJuFTxG?I8ST7J%IWCpZN^hel<=0N`4s3x6xw zGJNr%n6N?rh{hj%S+vTGc;FJ*u}sZASMvEm8Q|hPs9Y^HB>B)+{ud8qg z6gHvBg7&$;sr_J#8{=O5S<4XbvpDmd0GL; ze9JODAod`EUR{8f_cI+z2!fVGET22b*QeD70uoSuqoU1CsnTgB#_jd(3(+-#W@@C+ za)m;?xWMZRzwI@^z1YWw9Cz_2{b@1PmGjP)-; zwmy8HY#fyQc1~#2wtO}}n>uxfy2A@FyyEYNd_{S05kUPee^$UMmR;kC*J|h6nMlTK znv|iZr|NSpSV%wTa*7QW|GOpFEi{qC4N*Yn`S*s)hzHEO~ zZ)fRAm(HCuOUj`+&3)njoxwLc>=YRS-YUS&-BZ0!qq84iQ(y3+9N~|7%)BrH9ZgOJ(bbT)j+ji&#NW(1X+2Y z(+eZg?)ZV8*S!EsGGWqvp?MvNrOFYJYkFWG#TgA$Qm}+%&`7=%v9e=d%O)g7k7n?W z`VtrY*h+l(Lk5M$32qf|mBC5ikEo61N~xgGWXWOg&*GRDZ41SZC299Tu?iu6<>q1f zo5WH4q;a04N!6rjhosrKr1`p}#kr*Ao1|6zX(YF4WCZCHyy;YG>C}$twDIZm_34cB>CCt3ECd;BycsVk z)iO98Gq~e3cfAKB1@-j>urr_L z+k#et!uH*Q+Wx%uI=GJb!v6Zg!TG}B+rqMZa4%BPq*~FmW6^AUQHD`r`%WH1D;!oU z+$wMJnp*J>$KuWS;_dq4-TC7E+u}oll4IVIQ?(MsbH|d)_>$}TlH2)```eNSf>J<3 z@p62@3PCYELn%^1DY&5&b)gjft`w8744bbEm#_HgKv5}A5qv~hZ)+Lc_c8*)_+GU_ zAO(;?r@Tk5oETQop}dPHBP3%?0J9blv)`4c(v{D-m(v@Ac>{_011qTZ^5Cs17^ExV z^eROeVx^EPnWZae7s>@1DxsVp8q!1}$W;o)M7j%A?s-++Csl3G z>aMiv$It5iW`snaoGSIziLCFcwE`8CH!mk-qA((gRPfXZ za3_o?BoJ(;4)%3IA+-pDDPq`C1qn+*NMS@oqaY+QI7$F1kuZQM3~34jJyN}PpSHH4 zRTB;%U8; zZ1zyt7lE)mFrmiF^FUBKCVT)Ch!YE9ek<^p&~#ne@kmc(|I``T9{vscQ(m0!wQz4h zOr^lAJNh+3!3hj_iquXRlg=g#nnfm`R5oWKgYRbmkdnd6E0Ap6>yYf$3W+|0>LJ921L?ji8k)ej((}YxP)$I9#lsDF6)bdqR~c*txlFJ zaZN@LD|+Y72{2Ec?L{m6;5|`Nf%1rZLxI;&ls><(3cNrr@E{tf`<4tYk`J~HW|vMofy5rMMZGp4mh;Q(ekMF-ptz?Uq(%u*rK9EWpu z?mZs^)<}$_yeB&fm%LRchK!mWrNJjWgI`I97a0WHB2T{10QC<5Bv{yV`TGFSe$-J( z-ThvJ&##B|ntBTQy5+j&j+^5GrvS&}zED7)e9Yk`Xr&!6vIvyX1QCuhvxETC$TCOb}<6q!)3G{($c&gMbsZl1u}($NPQc03+*0~2z6B2#f}>iNn6 zi(}6VG(a*$b@0`18Y|+~S(#A%;2zV!00}_-Zp|BZoO%nouT?kdAy)YcCk9P9_Ac1fk<((&Y#a<#1V?;o`bzW)L{K{B{poegU89HI z>yfU7_OOML)~Q(?+Q5mg@|ADcEGDB*=_A9&L_fbCL#7N|@10r;?_=#7H122BKz-u6 zl#TU0Gga{H8xdyGr;s9bwDXm3&LBi0!ET%t#m`NPX+tlYh2*nWX!ymJGFs%>I~uX3 zN}_0{MB>ixD{&S(mUL*-(^tJcR~he5nJOFPNTNYIaPT5p+AiOhsGm|kg(Lgn0^%qQ z{;sn`@$SPjW@MM~NFGSc_Vif23i%7cHMV9;{^@S_CUM3t*mo7d{7f&2t3w?Q{%(Qs z;(0?4oM4;i;Qqjm*3!-C(nZ52&@Lrod&jav_D-1GBoMkS5Ytf|N%=J0mm7jg^_aX# 
z8o8S`+9_Dn!cl6NKK|0nOo_ZbHZ8fIS!;{bjta%pcs6qVdBr;NXC+^`JqBxN<|0?V zB3rRuKX2iWuM2(iKcH1i{Kdh}DSgC9`=CB8C3>3;SnpzC_TvxeWGcb<|3d zjSfPWo!M67uuF4tliKDMm*l?3Z~#01(&U86o1uU_-xvL5Bj5NLsx=MA z&ijvkb5E6sgHX4WAGVwzQcY~z7aR;3xMx`Uw0uqC4yQ~7GsxS?$SB%i$YUe7WF#n1 zfv#%_1Z)BYW(>V1-I6j7yd5G2eBVAeK4(~YgEoRhx_I7bJ+HDeJkzv^IK8DCgGa{au2&__thQ9*JUsVKh zr`(#e*_pr$O|}MEOVJs%5QvMIKc#tV1^vSC1y%6~VjwBMjplWR>rLK038lr=T;vsd z@$%=A(LM5g$`{unM@JV*=77*ef@bBB4QRVD@)tMcvKL@4L5AoSQK7FVA*+lsEmHnP z_rnzZJn4e8D*FXb$PP_eT3j~AA8tLXFY`;61BA)_hxqJFHp|59h#!eAp#U;&Zb}w* z+Am}}P+AZr89FoFog2O zT9TDQ{fy|fcxKx#J9RX2_|RoV@YfG1{3l+gILo${jnL;8vR%^x`T=M90%}Ir%#6wi8Yi< zlZEOHt|)xfkNb0#u>Z|qzM67_a=s3n?hznGk1;S)(KpxA+{PF2it!z&{{3beH~CUk z;tg2$UVx#2qP7=zYufmHC z2=~5z4)a-D$(9F77x;!D+n&9aTO!dZq8Fcfb$v-(L3(}^ArgB2)Mndi;4lzZkom*M z)K@a5QB{i!=$(44ACp`0mVyACt5ICyCpHge%jN%pBchC{p_+p7sciD>F9 zbwv_Rc#pbkzW``iQ8ak~HGM@2&8ByDrsPCxs;53Ock8PNQ6-1x^yyzqp9}DE`#w)- zl*rw%@-|4KLy7Za?*EbzB8X&|8S#!D!}Xh10G-(9&CiC}$*%cpBB>!#MtRxE0Y>?8 zX8A@1S@cGv1;xX=bcL160mjAkhyBJSNta0bapm)u#--f>-1}wy%mXGB!-55QxufsY z6N(b0q)n?!RSQgOmR*s}YIATI%<6ETOB2>@787FB?>8g+m^YpdJB2r#FE@Z|YUTpX zTiUnx%v<5m-m)~@;}RXWmE;Aov?FnWEjuw~`FA_8TaYbGakU%Gx{2JtR=pI~y~n-B zW{Cv-)W!v;{md<3>p{dSM)nGl318D8;u5)-VZlf6*U^qq#+N0p-4w0HMoEgkPP`K$ zela0e<5*$xUN*^SO2yoM$Yy4V`k-ozM|;6~wqMKCcHXEYm~P%gA)mvRrGhVEJs);bxj&!QPANT~UGQzxm@3D2y}18kEASWwW1j9qh9UTz^5km9YmYxzMMbOP@N>) zi6T%uUNfe*(bC4i)608HzgH&31FuTS9PhGl-A;diZz8o`neY%0Pl2K@xYluUO7NPsyj;kvoE( zVnTt!j8F6t&p>am_A$Uup2%Z~B8f5u-b+(F%L_?p6k}mTiqLt0my(!-mreqASv~PY zwg5e23sv-`V+-pS1AXT{1588m$i;cOSz`@|s96WT4C;TjfP|6x0$`ZoXQ{KeiJ;HX z?|nMwKOJ)5@{1r!t;$%qvM2r^%Hiok?0HE)5!9!#^>krp*z}ox=y@Rxi7%wr`OU6K z5cX?ZbVPs!OU^g6`VCKILvkApOn`^(J7gLlh~5-kefP29us$8W-b1RyQ{|G#sb;ozGljo z4aK8LOzCr~(5?``T;YX$d)e$3Q7{iux zYhs*XwcC&I{8XgUS&oWhLxry_pIuQ+~_+%VRGr}JJ|x%anAP2T!A7ZjFX`0!}F@Tup$%v zvjLAEwqHa9^3C0h;pX91@mrag-!mAaBSP%z+Qf=!{00`kBjEvra`nFk7EBKpO%^RS zp;%b}m+{s^haRWptMtd3CJ!BR=XK+i+x`ztr(!*>>ut*y z(=*LyO(oDXKCR?Kqna@ZBT9Y-(C8aTdqEV5vFU&;LJM{W&r~Td&pjC$JbY&^F_!vS zhmMUFUM+sbYJeMqo_y_fmw2$Hp_BCbI=v0L?O>Vw-mLiIeObfn;XscMwjbL~KLiaY zb$`{D{(i>KV<8`H6{#z|LH|mxUTv7pEXE`Tf&LI8I95s9<|k$a$8s#9uk|KB(dDfP9Q&Vf6w4&7zjNXp9^kaKVURQ6o54sae>|)2 zt(tz=&#-5V!_#w2ky^5G67w3No^5TCoAv17 zux#zC9RN*j*In~zOO3{_?SJ~{;G)y*<36drbAfGhPzAi^csB`e_}JsJtv=z;ZCgp+ zH)JDwZD$R;PBHa~GTNrRdYVHt!nI*_#n-b6ohNu1F8RXRLFMr92Y7Z|o9vIqtH}Ci zGeN0KRdu!%k%INkct1;$+%`AZxi)S1MW$Lc)qwt zQ%H_W;3~Tg77}R{cR4?;sb9?rxpgTZS(qsiZm)YpgJJ<)G1#G5=}zkg#*GH9WZjlR zG0rvBP{^ev*#|ozF{{xUr|}w%4PSSB6}Kz{(*iq-Szm)AJSdF{_~|SQy$Y*67e>Sf zhbBBbR){<1wfPR0-hqK3wTdyn9}j7-^I*1ncbEIMom-B930sVQ(FdT2En%{+2Mw%7 zTcKrZ#ZW5_Mo=+osc{|q;90HY%I&9stzt5A1_gn z_mHRcPONcJ=~Wl>GvlK5X}q+43GoXSc0M$4x9jnFUF$;}s}TZmw4?<~#p?Uc@Rs`e z#z9b^w!{n_@F^0i2D!Y5Aiw5KZ&W{5BcC4EuOECqS^2WFdyP-|ML@ugcwXYMjvQ1T z>>qqNa{OCzITfzyoFOi*_QqZ+ettR*V3mO8nt+cg=Kg-pVSb+iX#;C>-Lxcv{e=Br zbbGz)HHrlVt#7|<-=^zS3ADm<7}^fXi48LGwT#d4^!glhgX^|%ZClo>XSQu;;pb&F z>$dnN9vg5n`{sxnxeyXutP)z25z4|D_?R19x$S;U z3tGC?i}v$-;vdnL8V0P0h)MS*0(xTXn4y8}EX5<)dV|pH{fhkbzy9@P2Y3{gt(nfA^Icw+1=;v|o#+p+Y z1{(E7|GW+tM+EC-eS0nu6XhRsWfd2m9`j5$NLL~p9TZDA?X6`PuEP`KXXh@a8cc!c zZc|tnYZ~iiuIk@ZYvZFDha3}k_8q|6N})#?K+zN9F8P(~#=Cq9nk^AOYHwG4$aP;L&Dp*gvF{jbx2}5PrM&)yx~n8pF_AnUt(!)G!b1= zeqH=2k7=P~#LPwT<4n>c-A$4iPIB;ZgvxXb`cBA3oW_o1Y{qQz#$^H_FlpvG_6vT* zd0glb{ZG3BxijIs>_vW*Eu;dv9=<037LJbB|%Tm8dN3>3_+ zz;G`!MOHIhZgYfhbGZ90o#}IZ`ZL_)bDgDft!i?(9dpC#b0c`ZY|;o|%A z5_a>Fkn)r3^WI5?KFafku|V??!*g&eQv*-4T-Ou9W%;U-c>MYWH30>6js=zZ1x>pJ zUoOpC=nET=GWzKF~z0?t)K+9-A-iDm(+9h=c8kV0U1lnm|NjUkjqIK%E_h66U9ne 
z-jp^*mjmkaPazqFtQlkD`88uD7zri!0Ts7u6$E@>4yO{{gbL1v3T1APAaW&FLkZtN zMLSt}>PF=|S>&qZ4LHDR*==5F{A`h*Di}L~9ygFycNc>Z07&4d{)nt=U|g+N06xvH zG~KJFVE~)lRa>i9bdc424XoMcH9720F2$;HBdoG>tQuD#EZ&=qn|GwwVqk;}a42op zh9cL66V}np)wrrxLkVk%Fe^L>B~+11x5nWqw0l?0S8fT|+*yNHSkCK^n+w9Mr}@Q0By{a~GIB5SL%j*oe$Wsg^#u8dgUL z_9koelLNJ+VlS{_l1ZsfCfGMDgk?Hef8T4=@@>@r*3_ZXgd7e4S~Lx&*4D!}Z%a2j zT#D3h!D5z>EN*l2O80;F8aS5#Cbt7WgCX8(yf_ zU){C-h^$J=6YvHAQMup?$l83?0B)RUdf#!+ECAxpfM>#u4S=rqZ@XmKkl*SdBeTG} zvSPWMKy+sCpYo${@>B3K3L(FZ<|jhA$9ke-l%>JjV-uA1?j@dOW0q}WkJWw;8ZFTA zZIA0>j|;fhW3b0lqnG5hxQHT9LB1L;J7Gi!h3SOZtM$IF1U6`+(eq#!rrx5snQM!ODA!i%o%pz9j)C3;|%YoCXfS zBF2`9*_ImFheW*xj3Y;kBhbWbl^fw#f*0D zzXLbDn+zD73g{ac9NbCtzFF|?Bx`*mg4WhbvwRE^q-Z5?!eKuddjEc4Onx{|1LaE) zBP8kd$nI$W%ZVPviMCGH>CB0#&WTB?dI8$t3|in?r;^xDnN=!{r5{SmUzLUk@x4^6%8O|!Kh#SRu|9(s$u$2pAIj_B74k9sVdCX7T;G?WFL zVX+*4#{;T{V$Z#Wk+*zN89+$Sax5j4taCylkSurz#IogQT!ZaJQQe$Zc%^nciT>!-E?e&0HH zaA_${p)D>2DmbN-;#O#JE2X$Yakm7L;10pvT>}IU?(Xj1;_~vGGw(g;&fNcC|Frka zXRq&C?f-IsLgL-%D@@0kh!=qkO@~^_URuiYS{Yc{NjYULgoLE~T5X5ryK7iC6nGf0arrVuq4AfWp|P)xrG{ z23<6KqIldo*t{4giCyP>0>><`CWVslPh*b;iy|mg{o`}ypIF|%P^^(7i>1o!^`SEk zYp#ntALA+LW2nStEkDL9Q0rw|Bqeh#^`D1aH5c>1!!Fc;b=LTQm>2=1c+*g#Y{X!# z$ecZ_y_|5~s(E@6VCGkA$R76TV0>9w*t#_sXk@uEfV3Y{NU@gA3nBD*rY~%roBfY! zmf9u`M&){I>be3iextwkFxyDIh!1SXZxzR-M1M`93ba}3^C#IZCK&o-4X!L7#akU? zjG8wkYWzL83ff=s77QkBNv+6SZh9PDwRMi0O^6#e&(I^y@xkTTxPqZyXXDZD$`8*V_K5kW*S+mO+hvRfv#bW1qLvqF(L3VeeAMv3E1 zkD}?1w(W{jZ>}Hz%LL59e$~<&+N0yobt9fn+HfCb}KZl-ZjVvLsV=pFhsF>7QZ=MZ3$J zdlu~&rv?ou9ZFN5^DW0kN)Me(@+TV7ovzg$P8u$8TGMc4oB{wOX`aXQ%l_PN33;h@ zHt((Ol~U2twlUHqu?!L>U?(LFo3JHaeKDU{B~ZK=4W#+}v1NT0eU*FLn#L&QwP6nc z&ifMG?|7nWXN==)Z8xYr?2_qi-dmx~QR_TW!I6LH=O5Ct2agW$)Ba6tE^;uhX&78o zPMsq$=Pn}A&`xCZR7o6{8C!{VIF1JHN-u}P(!YNqGbV<}eL4(QJ`XcLhYW9y&?n#J znJ}GP@mT(gTyB!_Y4jZCFf{$!h<-N8ch!*^mt_D$5r?Y|zsU{%R2Y8o=kaX#Vdfd$ zt*$K`JPiBu9bNDk(%5PmeQZ1y{vnn4U;5*5X4=EEhDW&8^LnmJsCj;nCVI?BMwC#%X8C zzZ>RD$R~30i%ASkzaV~vIU;slRCz`Rh~%|giVW{}3Hf_=_2+itzTM#fZtv%^Z0ka} zdpwDfk`Ai<5hcaFfx(rd1Ifc;nF_dPZj=3A@Y^_skvD)|*;%uR1+keE4i5*wQ(}Z%pzb~k!k>EqJ zFs;Pw#*D1>(0^z-5OjOXQM5(GEIwsgH`c z9|Go6Gk2E;oA_-vO%SrZ8 z=hCk+-zfM|h20-pe|ved-k;WLF2KKhb&j8V+$;8|H2+~Eo_7RhGQ;#1i;~izS94n7 z{YOF8aTzS<4=a)@2|da|i(h}Fy1k6jU<5pVxAS+~_sUuE;jG2$oR&@sb(eC@nZg81X+IDRg~Y1H{s7{?Z=;)BXW4xWDFE+C7(q(As_ z(rLZChx_*6mz%!nMpttxcs~9_sbTA6YVmsPIb+yK&WJmTb5fFpIJeq|qaFP-RuN9s zfF70d?Ztx2GA=ug>+;Kt)zGlJ5x)W*+@2kl+J+OjsXOP6zWxg@qk+7%PY;(YnIBAa zzdWzK@03QOj}19vf5$}M4iwg^CbXdnFl6>+Dqv?KGXdHJ-8d6q9a*`##-?W4MWMQ9qtVl=o-8Ajf5S8--437SgZ7%+@Q{ z_*En>2ExoH7;na~%_$ftuPw#ary54aMf)s}U8A>8z4FZeyf0mo_L@ShJJO&Qu}SZ^ zo2-1s_X)|i#sHF4C^obWot4Pui@*LLr9GPXmG9-acSxF^sw4jtf;n&}Us01x-uv~q z9v2_a8^&ETbO7a;;JCRGW2~bL)RjeORd(2IE*gFnWBU59*0ASNRoaa)rr>ge(x0iS zbO|FC5vs^M+D())2`pcdMM&PB?>zG<3ike4J_h!Vxb=&FQv5#Aiq>M5*=wKK?hcsy zB}`&o%_Qe5nD*s$FrDWJ4DJ?vEF7N?a-u20xF-z@-c!k>{vgdSXMs&kI`LBQgN(}W z&#DxSU_ZH<3{AR1^_P8_CpAMvE? 
zWu1U5&6U?RpB(PG*6}Z2Blug&PmgjmAqJGYb*+G48!uD-m8%7Qr1PKG{)0`6Si%Bj zSYdjB%Z@8JO}n||n`J?p=gUuq{mN%Hx;Rn`v&7LwZCeudtlAarjnxTVC3Y2vYphD9MY8C=m3Q)SWLnxi249w!$W>hFhO{?O8Ni z1o^c2OsCVSBkaN|8}b50Mt&Y1_mHL7s<5TRGJ+;#2f(oeZ;{6} zlIPjHD4MpPUp8Xa7n^}E?jJN&)G3l*tStnz*R{hMh7_C@Jp;pcjK<1t&w+7es% zx(xrT@%S+3Ld3zY#cP47Rcc4O0TJBa{2o)wrGM%ZMPaKoF>~Z0Qn*Sg{(tI{7mvdD z5!?4i*6b3Cf8#EjE=!L8%-=4R?6|bx3>`na^Ke0-vTL*!>56vC%b_`%`=KyXenv z?FOT7hw)9DCyH1vW}dY9>+bt5&os6LHoLfLE`en@9?p3l#IDXJmmbg+?{-r!6%5}Z z2A>OQpPM?*`vAv()BJ8dK2v>p!qwX|kjIHS4)B>jUbi4e83Y zV#NbWYkZs019KqGmd@?~DB0mXl?iZ4FSM2bWM7#f_c@^O=M2;T8v5Qqb*3<2on0G6l4 zHCIah8AcxxBWMM{)eAsFMxm2L14sZ*yzI7X6rFuQE=fcgZv?O*0GvGRjz0*@)e3trt5H>THgrAs}tH zP{Y~4gzG>iO|(fz{|D0`Q>`ST=r~vdE>uQ@DxJVi7RZSku7e5SV0&V2f!kL}0Av&| zZcL^L@Err7QVTD57W(e-UC88$$LkQgP`6-9R|hpJ&)q3cTHMHFbB4kls#&&3#nZ&C zZ|HNHvAKe&^m!23v6L9sz{r)b&Z>AX^QUKp09KN=U6cQ~0nj4SZrs8?ti)ft0{eetdOHjDnlQaPC`;r zdD2O5JQxW+E+F8%P7N$inoxn*v!%1hWp$dT7H7tU?fT*iL7@#PuVga8Jc(Uj;PvF8 zHGiRSD%kU0H$5#Fm_Z?5BaTr!X9<)Q;*vhm7RR!e`8G6Dr#c=-JmJVJCKM3!aWBnW z2oIAzqckf=$Ll6bX;)05FVSv~;Lz0OYl7hvZjN&!TwW+LX*UmK=Z=k`J@+S8y)oa| zFG)NUmsuOg9tu2Viw2a(i)sN&uEDQs!_3@)X%=`SNDPt_3mg9wdkg3vJZG2Cyw0oq zgPD-8h4}`OMWFm_^1kcOm4Z`6gX7zvV8 zOe+mUlEOz_czTt#1kUydC>$Q~T1KS{3qd~%-NSs->3U>{t7i4Qn(1*8_1%r@p? zXJ;ul6vBEcvVpNQmZii2rBxOwU7#Z0q#}Pg7=k)3Z!NT=u#As`ceOA$%{Q7mF{-*Z z3Ljv>DVXb|RgP{5*f%T3SplGtq|~}qq+3)3h2;4ACuFUcl(OfI%w~O@PhTq(f79>m zDMw0^M5TjUcFtZl7ZR$vmys=4W!2Af{eh-DB@PW6KexIey00AYPoNUyTSZ({EzOZ- zF;`7VgJ-R6ReMwR{g$B&uP6vmZdX+Jil&HAs}c*Z_OLJGc&^f?Fe9F$w)mH*9y8cl zxbe)MMyQ_P50|PEwbxidlMQY9TDe!VXOVbDUAr@q{ViW;ydG$dU-eS1 zv3sBXwRV$6V`J4=V|5Y2_O@QTKWjX>X4AQeY}F;owSIP_p*guhX|5;<-`_;1>;{lZ z6QZ!-fjAHjP`L$;_IoXaB97%*RFmI^$UjrxPhYoaE-y+aW{1Z$wd{Xr1!Yto;UWI? zRIYOP7oyOEK^x+S@l)|V`SP&ef(7lfczj!>pMKq(CJf(xgz70^^n0ur7TH)EIP?K{~$o*|wA0@~msmV*I<+pt8 z(=5&{LNp=K;R_@I-*^ily0jw^k@y@qBcz|8te>rzvd}9lY1$tx+jSw$q<$T0@<^@F z$`lAP0l%Y-#!H>^xu3v0_JJ;6L*Ij%iza9t zHN>Rf`20Gi_j7qKTXk>3ovUkLZyKjW%0X|ERbPf~NUm;Qc1mB_LT{nMuVUK%&rbc~ zlKr-rZJ4W)lPv^akq43=Q}X?E2N>Eqte`CNMxuLrOq@l}*$Y5Du@Py~!D1rHm z$e5NTvu4j#j9`QR{TKyhZ;FVE~SXI1te zsyhe6tlhV(eWRUL|yO$E*q0-sL*jp}THqmhoQT313K%<=TgFqGF z4<%7OAmq=4RBYFp>}ZPuq4$f!4gs?f!qM->KPG=+sjI^$je;h%7bm~x0KXqj{(yHb zAv&pQ?S1?woPLifE>6`UNh$SkjTR@rHme2Dvh)|LS=CH?!qt~Ovtx_3%}b`_4tx9# zr;c>%&D^G8*`v?k(}MvsHSDu7gb~ntg?Ku}u)`rcXe+3s1FkoldFap)JIm{U%DN{^ zwk9pI9*BU?brsH#y`Co{oPYMWN84(i(r>%+DAk8E(!WZC;37Tv!CpO?iR(VBGV~Vx9MkYjnc*h2;cOi_dizWxz{^MN8}V zOB#WTD+J4hgG(!1lWU@je+eOG&xT1%=6Euff9x%>nJgQc8e2>KdTa6vPsvPteMPWy zMT~h>A(sQFv?>|ADiZ8KxU|Z&vC25HLUW|0Q~O&h_&2NfZ;cnfGc0Fr@0ZS+d3Yb* zT{V;Bf0Xl#Ter$x7bxWrepnYE+VHPk7o*>h65EjWUiVku@K-ncda?0Ai57im(PG+u z0N3UfbQ7>`6&PTS`O`cRd&{8IyykulCAv-lALh5()Zn(!I@;19+74l`a$=;naz3(j zAzH~5UpcT?f%R_<5bXF3ZQ6#Z5)+P&b`P06wk((tP3k>gYJR;_T)1g%v$H;>XspN7 zK-l?%8x1qh4)4-+^kg@?#cq**k3eJZ#(58y!nTyiwrt3jX~c`k$iCKlpVXxc1F^Hb zxU;ynkGbelB0lphNU4|`t&{s(cbG#*>H$g30mmiEaj9muf4=R3#2yqnYg0Q<)U|lj zSybP1!Csu%zYi_l#>6~`quH!q!yD39OO50X+C0F1b$rbIuAjWMo;7izrJ0$=$C??Y7S6pN{`uP$@966?^4A>LNXhv8 zJsJ}WUFe8V7$Z9V2UhM;O*wZ?`*(ghqNcqS@c4K={^N?%XdzTH_|;4PJf92J;nv8v zNH*H55S-gTt&A%=|JFGEZQys|era)o%Ov(`ewE?Z z)n(Uou9wNmk8-H%{INCP<@^5-w=$Y$uHc0HdeqF$6_AQeP*%&_ErgA=JaKmS8kZWc zfsMh!eXijZqhDxYUs6G1XAhkTRV4L&U=0xy}>6Y9BO0ck>T?t5NaNa$I6 z!MvL1JO$wQgf^9n9Hcg-gULF*c#%$;fNV)J}t;5U5kHkFbPslAk^&6SJ z^!A`QGCt4zFq%%AV{RB^X4mMISHk;O?y;O3Z^J>7NzXspk8p*s@#g-OBBt$?;Q3{e zEb9m#v=zXeTre-$X7CWNfR|76jrL_LBnx@reJGqLz@k%arXFNh_oGeQYoCZU;x#Al z;(P7Fm#v~`KF>@j{EwqE?dxL2--0Teni{~)i_Lus*5#L{f3?4`lOIiWL1~1&2?QP9 zltgg|C#@O=|Mm=Bjw`{udY3ce!g9(q^@3`7!aSOLaqil11#W3NPoOu%^I@UU1;Q*@ 
z?SgozW?(B8CerYDwmHE2&4x(yrnEw7Q#+Ik<<4~Ar2+rJ;`fi|IJ7hQuP!yai+}B} zsUgb*7u)N`>$&7S8vBWVW4E`D#D>0)A}oe}XlFen{?LfHQQH94dZ5h~TGd=&Z+)$V zyS|{AipO6>yp9*GGlh}oYxkKjub^FZe{U5kQ$r}BdiwJZK9du|TF19}=13McUaEz@m&jbJb6M{m;)8&G~)C+C`J%-hjk?p9q=?f8sWuiQ2T_1QzSMVWvU#P}iC9_CNL zt>-Q;wk`*>7Ql+(OVA#OkF0MS(GK8ALL;%@$w8+XK=Vg1KnZ%VUxU%xuYkzXwyQaR z9()20Xa$Y5tS`?HI>4G|tXsUQk%Z$t(<@i)@5br)Q~lfE>yL0U==ZOK%57V~JEnLL zSrEaA0j%_DM^x2K(SE^UcyNoCC67R%;0wk&AD|ffoE<<3{8CiXwo?3#=fmCJyOA~& zfp2juCkK>h81Gp}b-OMBptjA7Q(<`-8{-TZh+%k?rldpo4-tcsr)xIrc@UaXylD8- zcv~q(ilEEoq68U%9PN;h)Mjv2@a+1DxAHil*WbsnxAC8`xs8$j__O@Pz86bXD$NWt zK4td|8>Y%dSM!!>4`thRtq&Kw;V)aSkMSM6?hZA|TCYzH%398^k3CjzCm-vjKV2Lu z`<&nY_&3MTR!%7CAC!>9#0jJp<=u7G?x*v6fnEnX9eUXj9dhC;oxDysV8x+;i68Tn zWs|fB952csv;}w2zCgJ&JHneta^vr##+gI% zl}beQf|TA$hdz6WnN3s9F=?WBi_lGmY3|<6IqE&;M>;fNc?(Ap`w2Z_^J`%zv0n)< z*|+$@uX|LwpON}u;+=mtAN1UFOuJJUr+Pe92F#4TuOCojato1EsLhV}P5qotEuT@u zD9bf#<8}M<({_{Ya`aBWT%lXs$FYb|r)=FjvrjF!Zx~dgXcq^1xdnmJ21I??kJQ`8 z>OV1`_Vc`_{and~IxzJi=1F)jp%m^qkcs{yUx;Z>B|v*3%jS$+?vI69#?AL!xyjr& zh3x8OeG@rSKY%hhd7|G{(5eeRlvwE%>C6kEyhmfQol1)IjvI#>_|CF0VkHe;+b0oH zpGMK&Gzv_ zY*~eoc-Rt^)uJ z`eKZF6SYuzYE6kP%yI)IG~LMjPtk2LR}aHurjz?)__380YC?OWw|2BLnefm|-UBY1 z`mxAsP}W%Imc4zcwyu@%&?eya>*MCf+MdA#(=3n0gWB5qc`su{b(*JKDSz`Z%$86YjUsb;?Gl_ngy#dWp6B6+@rK#Y(GCOeV1nWb^*WjB>APcl8wQ?<#$iiux(3k;>&+4IG8so@h#_NFFiI{ zQg0%VjVBG^_a_vzchYrY;x%tw&mlhdUoO3mdXM!l-r`=LJ(fLotbTI(x1RW5KHYl% z_~^Mq;d<4h3bT)$3$uUcq z&=^_$-vD)(1^kQb{Mi%ynNa?$z<{@`0o*bH9GU@KW&wiW0Ab1i{+R%ws{j#!0C9mp ziJgG2l!3C~KzY|dMVUY)vp{9mATCzj##mjF1o?(o|Mp2Wu^=^rouDUk=PzKeF>CN| zaIk59un8*Id?wg(C)geca=Z%u!wPbi0lDOZ+)yCb9gxQr$SWcEM}835j2{SS`<=-c zf&#;Kz~R7M_+6*=ehRx^07E!R}9oQ-`Y@PL2uMA{c)3GN%Y#;Sx?hHEyhM%&AzYV7K zneyZd4!gc`{EIS;nF@d02}i??z+j8Ol8wMIkH8Iyz%Pg(?2RDajUdI1e8v_@E*nW< z9!V7vd7mFHixxqL8^y>L^;$NH$vlcBB#NydilaA*Yd7jGZZt1jH1|8%=nv-6{2|eT z1<}I2(W1N2Pa)LLY%yPCW4@ZlNQcD87R1Q+#whN_DC5R{<7A6fla2jh9;@+GMJ#IBJl-WF-mM_sqc`4b zH{J&~!H+E=KsF)BJOLDv5W-xL0O?JD?IwidCPuO)M$0C~nkPP`P!kIhlY0~4yNPMI zNf~TOf3jqga?F$RLXrv!l8SnhN_LaVaFZ+8lB;BsYZ4PXM}poYBpc=@7mXxy)hGYF zO77@Q?qW;nrb_AEO-41O^z5ee;ie4Y!aJ{1#?0ZPA@B)7_&_3jp#VO84PVrPuQkBe zsZu-5lXrKYPV7?;cTKs9*I!Obtr$0MML%T`C z6iUakfZ!#i6S}7p+@upVrlYfG0I4&ug)%6TGA6S!UNmNq7pBwWWs(VHQfp_voXcRu z%b?lIc$Jj-<|dPwI*YY0lU*+3-CpMVB(%}4G|{B+G7{6z?Ac%BvcFnnONVAZrBLPj zvK9BTmGN@Eg=QaZXZ>LJnU641EzHsB%hB7*F~H0H$(}1|0amxjO^IhDGlZ;k>0%k$XF^TNybVbAxI%MY-~4+_l(73POf=i$WVn&K6J zf!XO&1+f+d@u3BYg$1t3eFgBnf;7Cs4EDk-xx$<|bE=PM{p1Bja(?1_=zzcCD^@QKGnDv)g~0x?&UVl6(QJ*kfBAbjYVB@#XZ!;-ATnLq2m6%;z7KUmYd=+>XLT3 zl4*;Q*}~$<#*&%2lDWR(CF;@@p^^>ll1e#AE=A(8_NKeOZ z9u;`~73ee-t%d*bD3CXmGc;Aany+BgDd$M8yx9{!-9z8&^F6ev%E`;UxiMeCDB~6{ zJJl8~>oYGatPcYQYcU*^OvM*-i zE_$+OE()m+)~OLmt_K%YL+9&5J&GddtLnpb%3?UG0wE2F_zf|Z4axlt@%s%Bj>gE_ zCu3)0LULo4a6`U)V|r6#?rmc_$CJ^ah~}!UfyTZ}yxwrWK1seQm&F2sY#D2>qr1iR}^|rtDe!q3I-+WHK6+^+3?4%V3io`8O;twFNIm%95 z<&mVGZWr@Na;r89Xd4xxwIZ~M?4WJ=qLiAm{k1|nlU2J!G8)}o8waQLbIP~Y#GY`H-kcO$8`{2t;=Z<9@d&G@`PM2G(O0F=AC}yEhTq>%te>yj z->lH@3F&X-91yC-Yq1)rxf7Qc8R#dFM|chl>-N^}^bIHs<}>$?KnMLr`o?t!7dg#_ zQU+IPd!C3ejU}t0*MA1riib=ViWWSFjy->^AcoEdy8i7Aoj`}1bO-(p4BLA4UlWW> zDfHeejQsW-MuUww#|~i)j)>h2;}VXlG7l3fjtY5>kitgqsz=BNN8jF!e5WED+y5|1 zr#SY;bCeM_Hi#Hz8XV)f8)YLLclj{Jr8rLLImQbc2Um}N7#t7t92X><`1yWZRB__| zcKkDJLa=)L>)-^D=Y%ZbZgut~?Ni64WLJGWz6gj10BlX{9%>mHLo zVN*(9x_=K&9o$Zu6HXhxpR!h*PVktrfBFDcPW~C3ZuOXUBb>?oIO3%^({(%T2b;mJ zoDNc)v6Y9FV0mBQsxuR>ow0+m&^;lpQ&q}t(0z_Z&Q>?Je-e)&$q)CMo#Ct1}Bj4 zg<-hjK=W)5;S+I7)z7skdblt(xY#1SFgrLsvADPnGxj4`GHzbnC|N4}vlPU&B(qPr zOp}>=s5rArw|q&5SF5=E{(kA|aM_V>`BrcG;(i%R%mZD2`rdj4Utettw(?bRg@Ah% 
zn!ZfDG>Rv-N_8aJY`v;WxB8WuXbtCTk#1>%l75XP)e}u=jecpCwRDxdWsSgkjaz9w zG;fV}Y2u~+IswtTz|!!$mi5mMz?sAKyW(|;hhE3&zDSL$vv$;!q@ppUW-m}GG`w)@6+SHAt z)crl(eZ<2|{loqo?Lm9kOr`z-#Pi@2ifE>@<-irOKSZ?LM}O$Qa4q@%ka9Se zIyJR)=z4d!LO-=2c66Y7w3Rx!mU?81I69!8+<7?C6FK~JX0v#te{4c~d=)nFx8+#P z^Z4Om;fDT1Z{Zj-Z5-fpq9SsF_jUn$`9v+{goJpU;M1wp-N}oxd9t)q72VTx+LvRL zkEf!D(^t!L^af{Av}dg0qiBk_C;iXIXWmYoUtJeS`5`Z%?T2du0EAp=s-KBCXuEWcJ zZI8tlvexG(a-5pPf0L2_M4NDvM$3qvSG6X;#~wjDyWx;Qq9vCDWDKuj3@$`HE^t#W zV8p{##~0CW0incY$%cS9xdsNqTU@*PxoAE#yPK-;tM=BbQiGN7$HK_R3s}=lnfrCp za#`{`TbeTBr0>1#ay?qad#AS;GAfND4!pP|=&u|B$j3t76#%>vzz|Wt@`-ERhkNT7 zeW&cF|(7z4<|%>!gbM+eeTadVSkgV2R|p)oPMSoje{B(ZQ@#zb^{ zG+Z7K8YC7sAt?o(i=Gev4}M!iM#cd^G`F-O+uA!iySjT&z5j*Z4v&nEJ&sRIPEF6u z{=dYxGaLOWhd3=P5?zL&SHOWx0iBPLOJ;;^3xy>@ zbovXk{EK`zHdu8|x~%XA1(n00L(l)=Z~rgtt?sI<%%2)=F6=9twy3YY2G{@q+M}ZD zB?)Gm#yevnwMj!^bZ_~3X!;C^N{mHUH= z3>FxKPg2D({rH^85F1FwLwiMY`t6J9YNS4$1T_W#noJ!zqB!zR+LSUwdGtg6Hw*wp z+TD*i`RP(OIqlU#gW;Cu|Hf~XTU+jLucrG%hhOeD%P?ZHuK12{kZ8(_Pp|hf-`ex3mw2f>wv~Ytvay^k_eH>42dN%q*UWBFOS!c zcz!~sPWf449GsYL7b)}NdmITS$Pk6{VFYZwiIV&AiWXan2=ljCfaL75o7QgBa=c+M z+g5^cocUIwS#H5r(je-`Qk?o~(0Ga`+WT!u&7m_ez;~7@ib%8VjX!|$dnHQ{grJ)C zC)FESJ79VYEp>ioCeNOkg95poOxpWY|8;5bPx5vskF}i)dI&RxkkMvsfTk$yf}$o=&8vRT>fe_yJJ;_=9bVvqbumpVB6dn9{iBL9|&&gsPgwM<;-vOtE`d#~!D*?fTfeo## z@Bx7@W>m^4s%PkMsIS>!2ls)w4ompJNEN)9vDP#n`li_ln;%CQNI(*J;yAAU7+U0j zPMOk{|1+?WwV)?7eHfu1;Pz^g$psNBT4%2RMRv{iubE66rN6BLbRQNhMjHUI5EqqefCuc zY>bA{5JPmP$%~B%`1V!J;2W7hP2o=_FHfWAuaA3b40?Sd!UER-J@39!Z~Z1keV*h* zVPgtlc4Z0`8X03aYyz*+^<6V#r@)eq!wyY%+!w|vpA&W5id^Cj9CNbTP@ z1{DSyLGa%cNC0M@6+#os)n=6UHVZx{G@Hhdg+RX2f zSkycUbFF3KmgkPA%xnqag?lwt%Ik$xZdSm!*&e9)&iun>0eIuLAk?9lX+t>82 z0*7#7i-ombDJQ8~tO;fYm9e%}G6ph?YB_K8l^@yc*J4=Ad{LX`FG=XMs?-IUqVYE% z&*!On*!tuoDTKeK@ujn&t&qG@L63`$=m_XI#NfqX%Eu>|8~Ok9Ol|Ka{GBV@r_>nV z-=lYHV=|+8kbQkJOrT>|V`6jNDB1L+1eDNe`}t4CBv}zt!Td~3`Hyz+WZx~j^j+jHx!1k}4@}QRe9;2T%n>C| z_-#3?#L5&B2EtYgV;SI@XWxhyG#yqB&Ylp+W@Ht4_Mu?1Pkkkf?_Vo{Q`Q@9r%a*HH zYmI>GA3}JN#pgG=Hm3D0M1lKk*yT#1Pp^5LYU8KV!M;v`=`Wa8R>jSu35N31)1Hix zE_8M$pY3Ec|J;Q6Mr{_bQ$|cFeg}eaM?SNfh#MVVJ!4oU!((&0if?jufR+rFMSv7G zUeQp&YJVSsEjE{NwWTsaO`(QiQ+cTw*1fE>d7pmry;^y-6-cSq8RmOMQQdpc1ppdy z73+p9hiaJuDhS_VXGMHh_}e{oi~qsp8I~ftj$Mm1QT2a%OcD^Q*1O+)l%&$A9xp2+ z@-fEGwbCY{NO93mi+GGty8-Jg z#bz_ceM!ZRnfw=dSvTt0RiC^@a==#H`s^ZQu0 zoaYMP@2Y5M^|lh8&}`cebG~hy+Qt4W;+V3c5Z*Y`qjXW3RrK zaf*UDNW55FH>n?PTDly%yi!=)(5tKKYvsCZ?+`iJV0NFm9lGikemLC~zb!0jvGJ6>b!3F{)iubX`|sW zCa-sYrlEWDOyqL<7Up@}^mu<_P6=c z?706*($4rtYk0n>cdF6wf3F$vaYpLg)S)ZSpL-_2ciOCJ%Yo4~KvFYMYKEUh)Aw(; zGv`i#5|9OcX=i`wE3Fx%?i!@3;kOO)>;LGXd==C@_2;vu{TFbMiEFS~0<)}uz&BKo zHBPXdCPn9^^Eidg_k`el*94F|>W4PU8*@2U}DGTgkXk zQ3eED1@80&sDZ)Igpj2Akb{VTzuSMd)Irgdpye}ACZ$aRDx|PJw78xy>{_q`>8pnhT!m8Xc=*ovc4`i83ULQembZ7P*rU=Dq{fjrXT!2|ICxIBU2*sYowe!$W3Z zV`@Gv6yf8*(A6EQZE!fIRs<6&tee>zupQnB_P*Tq#W0T`6O07phlc~h&QTFHnxSkU z=sTH62CAqJ;D|$%^Ygq&!fRNtM)<4QDDGXa7Y*TX3NUL!WD!M#DtYAFhG>yq*Efj~ z%-6vm1VhkVqI7zqL_-o|H0$}&Wm2} zBcHstxDqwJiN0pw?pc2qEvLTOlqc=&;Ce#sZ0xvTm{V}FX94_4dyDv+qI+pKb`8hX zgloINx1OxGi9zjdF&jOp(NvyBf@y!Aw6`)dAeWxkks(j(E!W9z+LQIRyB+>VFdgeA zz1cY(@5y>A$Gl^n+yTs>?t4k5{U7P=zqWLp0zaCk0e3uu;Xl$_dWxioRP#(8ZT{wl z%=g+^rl2HAenZ~ItREF=AD^7Jd-Pm;nHIfSQrdR!sIyI;q_@Y6nXmU!r7beW`m(=k z=NR#3gEX?gxo5LltH|QznkD5P`sQ@krv7fs&@0S!5X$Rh$mPDt z)e*{ZYRvP*BecSEL}yQR-HWxK%L`7*b?!?H=*xPN->Qb)hi2zd7uC<@6=@gM z%;h)S6p0HJA>@k6UDaq(VJ&s_0K_ToP6!mhpI3HIb(>JoV0e)8T zg7LnRwZ??$n-WN2$$C;LioIZKu2h4xbdS2sSgZ6%J9kHsz3j@OHKQdRI=9=p0n=JfnpkuE`e2U;YvFp= zy*gRYe0>f}eJCU6n_|Ntu$wXR0 zZjlEC$Y&{S)oiUV?$Sy3irW~hjBys)l)-I`p6zuT$TthA%+Pl36e$NrG${tm&m$Oo 
zPt$G0|5TR#C$jXv8%uYcQdS*O3}~c09SsanRm7n^i$HPmpg{)EpUO&M1E{E{7@cl!jAw6LN^b(9H))|a z<*qlCwl7_zFY^h#_3X<{>B~p-6)yA@-}PlG^oFFMf_3{(uzMp?K%9(*{fJ__`h|WZ z?LfQ8K&S3Nx90#VWuWg#?YJ=TgyD|R(+-Y_3{L0{PI(T_qzuj>1{W3vm+l5vXouEB zhBoMQhqgS2c2b7+5JLwGLq~T*C$z(7BEx@mhc7&bX|Ga-ZxF+G3&Rh0!%y5Ty66a| z-UznW2oOGkN7Fn)usA|=KSDw`N+vq`TyOM+*C-`?l)8D8mU?lN{(h8!ZtRul*c-hu zX0I_;_!xWh80X>`nEQT=hi;rtbo{;E_(!jC0r>bK1KQ^Ze6@jbF*v@cSBJRb#FzVV zX|9RSdJ{6u6S9L7a@G?{qLZq6lZw`p-@GQZ;ghK0I zbjs3t>NkAK>VC=^jxPio_4k~1g-^RTPkSy-+vrYH98UX-&IIbs1bfZ=93q%`;nf8| zq9P7v!U<+0q5V;dsBp#E1h3g7y4l3R*<{h#H0#+^(YbWRxlFyee6P7ez1iZ$*@DHn z48r+buKBXVxgzWNYOnbU*nIuqOsL|h^Es-SVF4k&V7`UwfGu>DEc7(@_a80{!WTxY z7bXT5CPf#g;EOZOi(`w6!~b6ZUl^e0?cLG+-QpeI`=gr>f?cVFn z-R~{m@$KC${M*L70`TCAzK!1z4G)B@iTCZ_!z>s4ZK(uq&;Wkm0iNImZQ#8);6M)E z%L^Xi2tMHg4dJ|4;TXQq9KMSZ-r&&Wa&^;&n{ot(f90KF~0(ED0b3 z$Rz+HAOa*XXe<97<1g&u%VH{b@&U~u4FvE4FHjCgp5%Em<39e+PF^e-KpY1V03^@> zJm6N4KmgJJN#)?>dDG)he!@VGERYlxz)(#BU;#z`3}8MEZ+AHn6$V(5f!&#Z1Hrz8r-peiE3=;5^In*sneFzLOJKrz5N zhcE&y;0wDh08IW1eZ>pKJ`$9kT^>>E%`)h%F2QA9EL(E{#y~Y|ZtF?|4F7@bxeo3I zfB_b8NEsVI0?5(rq8(=Qrc+ z;m!;2-s^|J=HM;>i_Qz_)$UF6?gh^aP~sR8KP${4?fPE8{5~c;(dNX!6AjNo>h25Q zF7bQ<3_LOKz7Xq!#0z@?@g?6b1YcUeaPrI2@f+XHIv*wnKMW4i3j=@i6aVux9}Jl= z0=@9?Dj)F(pYpx{3kCoKC8O~>-@nz)CLezcD=+j;Z}LSy@dhvUV-NK)-}EzY^vK0O zO!O>0fA#77_Foe80DlMq|Mk6q^JNe5P`~g?|LNeK_GE7l;}r=kzbsY{_wXzC`SBn3 zP7K&$QBx4i71I;7C_=-=?q<K`|Pay_OVQzPV^c9?h6nM$Wi=H5B(0W6t@2IzOY@)4-A>$0KmW<$DS;SZ~ftm z{q~_zhu;fw;$FT09e~gQFbL4XA%I9Y$bgukI5>l7;7FLUfG9|KX+b&IF@iYr$SBY` z=+OAcg9!8J`U)E>J4;(@dke54v4Y#{`wJW_JWO0{e2ko|yv*F}{0to}JxyJ0eT|*1 zz0KY2{S6+Tt;;KJUTJ_R;1B=d*s5e>OX5 zHm%yVY}>kh3pcLZxpeE=eTnuiumlihjO52rR)BsBbQopoDua%ZUb_NFus9UR#5ykV z3*gA{BgKprC1CJ$24=m!BATfdGSuj%Z`m~%h&{JZ$^M)_KbR9=Z?mRfF^)|6b5=H-`Sj!9;jX8IE5nQ*14W}9xl z31^Z>#>pC+bXMMpXP$auXy=|bS?On>f(}Zkf_)aM-F%2H%4nmGVt44HC`Br1rIucb z=}&WJirkW%ehO-+qT)p9sMr8%YO1QP%IY%e2&9i*2?RTB~h<&2|fJxZ<7|?6}NuYi_#g zt~(vN;iAfJyzZ%oKv|3+XC>x3NOs?Zr?T>3%n3d zOmW3Gy(&Qv67;Zf#~gp`amXN#JWIwRpFA?k9+#|f$}X$yvMeUQOf$?I(=xNoHOCC* z!xsMxbkHXGJd9~ZAB}X25(kef#b|YcMfAeSf?2ugp(>?e@EZfBvxVpCQwr4}AdimIMZvKHojB zZtrWL0|!<;|NW1F+LK`K6!nSx{!W63ORsGu__88d0 z4T>&?7TjDB?0C;EIa0A}Dz%%1+7hl)UVvq{1i+P38}cmn7yB zIax?UelnSH%w@sq=*wtI(@&BtCLpc3OJULvnT9;%F2}~qT7J_l&YY$>&xuNC_ClNA zGo~st*-A35@s>DTW;f-i&L8Fzo&4Mfp|yyif&Nzq}>GohhGBu8<%P_vK{q9iRTf~t8@c*0Vi7+q;aWeG=dR*Rk^Eay#Z zYSNtQbVMk9r!3gHJ$D8)7BN-mH;dZQhO%X(J8h~{(F0IQ3e=|&%jh=iMoXjOjHVyN zX-$~=)Ub*bQVIl*RoiJ%s=@-O*W>0vJ$g^CE-0j9t*c!L)YG97^{hcnt6uNNRRr4e ztx8>sTr@UhnG;799e z%hT3Wv(GGSXI~53>_`+CY%OhMLF-tTY80lFB_Lc0OE=fXcDTfC4r6h(kbsUY0uDl1P&%e~vOx4Z70Y<(*$UYYhxy8LahgWrjN%ikIK?bpaRE~&pBBq_#WY6oiwDt@64!XgGq$mR z2`K>~2qDNr7BZ2KY~&;td6`HqvXh$(=tWC<(uQvIqh%3kN^2U?mbS_xCIISCi+a?gF14s>Vd_+?deo@-aG+le z>ts~9ShJ3`tpi=_U*-DNyvFiuDYjbcUJDz3Z>P3< z&hP%R+uXgPd!Off;3mcPI{Egvgx?sPpli71?ri6UZ%E(-ued|kTT!1Rj{?Etcm+HT z@{WuA;~l?o=N#_vblXSe7H>ImBdk%4JNKPV$lGeCIgFXU$1OahD4{ni!9{ z)SfF#`GS<$2MYSopKeNn|3i7^0`_Mk<+0Lc03k+xhh4vL8HqemWO@q zPlfqayIyCr&m8Rm*L*VD?=DuiqqU`SH*nv{Wp}&_ey38u1=YcKXK->ow|@tE;0F(R zlIp#$ddgo7{x6j))^yB3%<~h81{=^>jwC82v z5e8ty!yU&P2H~BNx^#QbzV^g_r0N6AdIkTk_cj%ij8&LcrGJW^Yf3Vxv;_%?#z2a%kc+vxY`{=L#^zF`m+o6&CtbD!i=}#cg zXMY12J>RE);U|Ar$A2JVegx=%>|%iWgMgy9+4ETdV$b<^RffJ@iAjo_q^m|j-Lra(*DAgy;BAqym<2Y@sh%MDWNRH_UC+KJ??5K|KC@qOs zG42SD^_Wh(I4|O8kNcP@^caig$d3W}O8`kN{wRYy^pr4JjxF*(mzg=#Udh zO$-Sw5J{04IV0XSZykAVv5+zmAORvtk|Sx7C5e(I`H`?dZ{Q}9E9nX?S#B`d4KfLC zG}#RpsgXM=eC}sN5$AqD8I(ZzZ~PW-9Z7Hf#$`i;k38v=gCdD%rj&9sm2Xp&a8{LU zla*;>4ifp4UpXd{lyC}{cs((ek7I}kcb15AV`jOQY?*jMsZuD1auVk%773Pli6vwA 
zfuZ$BLrH{?wR=bOmr_HQDfdc?=P*sFmyPKr!B}AfC53@$Ta#%`iAb4McXaRFDncp>>(J7h0 zS(pSCg~SPm708*$xt-nVo<-7}OvGH&d3DlRXZG2D)(M+#)Sk@JpYI8v%;|;nX+ZeN zd%o#{;t5}Qn0=kupzIl-5gI4)iAf5IngvRr_nDzcsE0h*p@Np2wGt|#9`c>_Ii40; zp5_^xDC$ba8HVrYnM^sNGfJ8Rx}X?(qtltA2Wp^VIGa@1hN$(QxNw^^YNQoHp+Y5D zk-4Mb)T7slU_n}fGJ2#{x|cS3p*otS8rr1!=bGqweQy}1;GmOMYNo_dqBy#uEGkcy zd7@0ZreX?%atczPX$@t1rg{1sN&1|-MV`n7o*TMlKYE2y8eT&hA0euzi&`IiI-gDI zq=M?DIXHn2I(i?v4S2cAsGABKj{2tq2B%AEqL2!uzGbM`Nu`{ssyw2QXwsUL>Za$( zrWrS@M(3dE$)L4lr`4dTtLm!@whWWXhMx+ZDk`dKdaP1JsAO2FxY`Xx`m53U9iCdN zWLK?wcde#Lr$P#*b4sR)Dy`w#E?AnZT3W7Lx}}jys{83p-Rg!ADz5RWgu6O*uR1}o z8mFP!qS-2#?#ivVs!;Sg4ZSL_1uIhI3aEj)t_ZrC{JMIZiKz#=iPr$F&jxF;Xo|3) zs<7#*uIVbQb_lTEs-6)$4&ZvRCtD&JTdex3v7_p-_6oAwI;A1|uHTBND2uZ{(y9l? zuo8%`Dax|6YOMQ8v;Qiyh?uH5tF-z-twYqK(JiPwu`w8Ml*d8TV~sGGXb_>7a7wKp5Ov8xxal8m!Ri<(x9vrxOYyJ@*A3%kp^l;*pz0KCB~ zX~Sz^s_P1?#=NPPw6Y7mN&>qD(!ABnYH1d|**mTRyD*Haz2A!r-m51S3%=z$SJQir zT5G=RyAtA?C+<|szVW+$kjpUeE5G+US?GH$48~zB#$jx7rZL86jK*F(#1M>3DJ;$F zOwHZgQP*rJ_?*uZ%+0_|&m5YzJ}AxujmnYSfln&R8)eUdywD82$?;r$mAuSKXS65s z%oZ)h`&`Xg$Iq>d!XS;(vgWncI?|cf(y&+2WEInFDecC+%%g$|qX~`B%S@sIP1E>` z(Fr=wBCD(uO(7rs(*(WJeOc1h+|(qU$w@8M+G);mJgiUc(*9htjXc#|9l2Ev(aXAs zu4mK}lFeSds6pMZL%r2SJ)NBEpg4) z*=y4sd#=J9v89^T?OfQ84P%~c)iLY07Rb@;Od?1P+3YJxL0|-)4cebA+M!L_qiqDe zDP%_=1gg#2tL@sY4co5WmZp)~uWj40joXTQ(wkk_szVOD0NlSV+`&!Uoy)q(d)$uA z+kAc4J)yh?^xSn8-AXvq%PqgJf&@s=0^7~q-R<4q{oUD(1hXLC+1&t0aNg;S-s`R2 z?Y-XOEeqw%-t+C=_5I%2t$EcoZQYzr*uc`?`^~m{eI^19-~oEr@j~DQ{+7DNBpAPDwF6x~w>50zhLFn8@qcIvIl&$XSuMX?6F6*=I>NE_%H#lzYgrAi|d4(-QYa|+noU|knGE@?9I;X&+hEe4(-z}?bS~0*KY0Ej_up7 z?cL7p-|p?<4({VF?&VJI=Wg!lPVLPO-YqcR!@kQb0Ppio?-=!N@Ar=H`L6H#&hP#1 z@Ba?)0Wa_aPw*H8Z}10?@CmQ*3(xQk@9+-~@ewca6HoCN6>squkMS9=@f*+a9q;iU z5Aq=|@*_|37A0@;Cy(+euktI;@-6T3FAwuEFY_}OPxCcz^EZ$4Ij{3O&+|R+^FI&t zK`-6>kssaWD6CPxp0i_jix?d9U|-&-WF5@ArQX_<=9@ agHQN{Z}^9g_=&Ih2#e47jsNz60029}9HWithout CUDA GraphsWith CUDA GraphsLaunch 1Kernel 1Launch 2Kernel 2Launch 3Kernel 3Launch Graph 1Kernel 1Kernel 2Kernel 3 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/graphs_1.png b/docs/examples/te_gemma/media/graphs_1.png new file mode 100755 index 0000000000000000000000000000000000000000..f42b50fe0d7804e638f5e719f90cd381cc565fcb GIT binary patch literal 16100 zcmeHubyQUE*Y6Mtf)XMjEsZotmnaRAO2d$XG)UJVARQ7Sjg)}I(A^yhQbP|V-NMi_ za}VF|@4d0!d)IyMT6evF-1|L?#mqT#&hzZOpU<=7v-XM5P*Zq-M~MdlfgUI+%4&f? 
zn1;ajIo!Lz|M*i_ZY6Y@(baJrbdT-%s zW##zZ#>wpvt6d5Nq6aC-%Dnc<*kAN^d-Mh_eB`9-X}$Ea>){J}`>Ln7jPz^_ytcnI zU3|fEZVT>23jJ1D{rivm*P`Qs#hyOpjr=OGR#LQ#nJ2;>$k5Hgpn_|tIP>rY?7*$5 zWCxj7tX7x+|K8kGl76_Ek&#$_@cnABsr~@z{>J_DwyAdFeFoqd|5$82nx$G74jgXg zKFy7(gJi_S#PMpJ2n3=s_5*O4%}n3b)zzE^3%IV?Rb`XulCW^Rw?>)BzGvEGfKs7a zJ>lHU0K2tHo^r=Q5qAxxK51HqS|+Rjrfr zhLXmuSXoQ>$YWX}c@%CvKjrX|=YRk%1o%f@DkN%IxtuQckjzWAE4B?C}(R zv;tGfo#Y_aY{4&(ep~g~)I9TdZ@R3?MtJyuSC?9yh#sI7Gi~f)d^e+^=ED+3wFP}P zYI|u}S=1eYHi21&@#mQ#6q4-6q&)G&Ov=8hNv;~d?^O~6-Wn$Ucz}!&TJ(t%?q_cp25&pb!vsmybm4Ex5Cdm#b)eanEu>a z^PSVKwJa7NhKTq4KEU$Azu2KMBhD{k`gFx}F7iZYSHG%05|^i&%$<_PHi#RmBe{NM zG2Wa>QAVS6nhJMx+<>dg%F5=GbZ5%nZ0nJLOl8l1P$)dqk1*hljHW^*5`esX?W4Y3 zKFS$Uk1r@E{0Q@TvoSFz6&;IQ-DV%#kxcnR!9Cm5%!>u8VCi#_HONhlGT&WJc?)_z z*cVNq@B*|O%tjF*M0;18wwu{dR`!^CAzvjG^5Ee$WwtDpTZErkm7FDLg`m7}c$wOX z;SEnIIQODYe1HV7>UQ|wP>M~l2hRSZrx*0Lx45Ly-`U-@_&3b`{{6d0>@ILQH^i#J zno%ub(OPgiMb*Z^z8lU%(4{ktcdg_b&3M8iC`|h9a*%?zj3+@Lv0|g~Rn|E_PL25p zlm+w)YP5sknAfcAdvqf(W!|{x|Cur6;$?(kYnDdw8lI4vd$Y*ypZ71*z#ND_t!qcI zg~bd7-FAl_yH4JneRMm1ZBkHNci-YoU7mhji4Us;9#Y_A?5ElAeMT(?O}NdO^vW5Q zt?|At-XT)bWNYf+p;ZQ90g{q?R;xy+)@*m2#(7pQMHTbMTf5Sie@e>6KEkeq>_z_w z$p54zsBYYSRifKCxvS7r_dN#TF>Mg5v8DrM6=f7#Kb_*v{i#bW9}%afK6U_MSR`~t z-+NWspRRF-GW+Ipe+8GtA^B$_n!wpK>RpW>77!5)_a1T;giRMCKkDiYh3u?jJ>%-e zoi|nyt?Ais&C1ceKS^dE?pGUSb3z)Xxtcej&3Torc!sqAd0`Q9%<>x5#|l~Jv#+v( z+I>cVw&SsFb{(X8{uRc7bPnH;7s=Ct)|-lL3OrZYq_e|29T4yP>Grcb z_KKr>WV^i2-WA(Dm)DX1awwN)#O3}VrQI1P^g+d1mG<-k_ORl7Ye6~2A&+!m4$@)Z zfpghQ5tsT&L7Oyn{5-cl)w+Rj^I5cPF5NPUgpx5vILEfwu0wZ;w)fXKnR?YaR07z9 zpyw5`_2gp!iF!T=+`5c{T7 z=U8GSUA?!hWGX_{Pyn>F&gV89K3R`#olXNh7ha`5`b4sXs_p9N@$19I@Py9I)y)TV zxxR;H6rw0wYECpQRl7;__Vmq@J@>R$p$1&jFWT}h4w=`jY<|}+tn8GVQxxFY(#=mv z7)!3Y6{7gvahClcKMc|cdP7P@pTY%*LV2Ge;E4_jNo;B9 zsjxpLNo1@Ro#`jw;c$8XfqU2|!(Zg&CYKZjly>p)#({t_Kph?N6B?~fB+6lG@;_G= zQ}jIOe!S(l%H$*7)jN@pB>z{l|PK|p>Ds6fWdz0cbMZ$(Y248H4S@EV~Kl+>%12#IcOTiY9 z@>vsZScU1D)qXSIx8_;an;IS1hT)V8iDOsGSyYkwiLO4G)%x2kW1><@Nl#K@{F^b9 zIpNpGV?)_T-8*-P|E)PC7p4p4X?3MF{3$?P130 z(lWL_+<;E5&7Wm!!AG7S{3KWm$yE4jvYy=2k%&7Py0EORDb#99w%f91#0#1|(e-sB zwEgb6n$`@LoqaHB257>YstWuK9Kz`YoP25WSWsZU>um|uG#BaeR_lARx>N*oGm(2O zCN}R%<+PE+DIjWi`(pxMM(ob}3U(w~TKtkrf!H#EOif~?;*-)KBGsTx!Y#eWZ5#6~ z8x9%%D0?IKeuh&v+7Cm24h5m$%g|bj{(IEr-ye84H&d;97cH#C=D3dfoYD#1VYMSL zoxeJFp4}toJvo~m8`le7G}M(L`oY_YEyJ64)BgroA0yvU!xXP$qlOFuT$ZwRT&Y>S z!nRzwMQ-z6ae_myNhtbeRi{)BWz6}qU?RE-Jd_aPB!vR2qNchVc49vc3_R_LjTtuM zPc;i#zRd8YCUUOY01efbcahe&MLw?Hz;ITNdn^5GD!?iPY0RPWD1ez_bghM7N4CrHu%TK2ACpp-XZEPV;W!^#^NN0on&5JfNiYcH{So=;^F01z{X3w*y&Llx2iEwc zjM%|1k|7Yt#r*{B@%Z>fEHdf#K;CWVwdaSH!(S3)NGh!M=WV`A2aV?>#($pIKDkPg zKRO1s)Hw8UWAB|OJbCDf!HbpQj})$F1tNZ?9DT2v@x_Jxqk%_1Wu)B$QSk3PylHU} z4UKL5A+R*|a{d+~Ks?~D!jDg0L4OJ9ObRydHh=X2zGNTKq@`O;x1GB9$xs6mL?Lkb zw!x^ZNK$KTeKU?`E>;R_RK_*3*=gu-Oxm`Re{iw4ANDe>z39~5$Yq$L;rEp_pl@hPP#Y*PX_B)>)Bpsk;*?3YGQR7j zLiCx>((Um#qf0jIYSNM=$iuLwLN+GZu51(LQ>f}WM@gKVDOeyBki;1uu9F z#h+>EC(}o8Pi3&+zOek`^*$t-YU(}f*=&i0v4ock z4OC(KMqOjTxncxm%BIZ;{8D^+R~wycYt09?`0rEjk%kf4yF>E^{V9LJwFvc-m)&NT z?ePz6f(6-II;)mGom8NrrX}V}ik8@@7mzg6XDkC%A&b%hv2bL+TS}qAF)uZzXSR>) zOuA{kd%#TNmTT_R2CCTI_fqXeazVu^;;o$&#sGhYgNF!>&_KdgalP{!oo44fz3VEE zq-@)}5y{*Aob;xW$cu?g#L&me$)Cas=;_Vc58zeSGC72+4 z{cF#gi$&@9w5+JUE5=Vu}*aOvw@t#?Jq-5KkU*Xju3Lq3;O3#hlXhbKP8p6d1do)mWUnzlJiGfK8({ffI zn^WbJdaJdQXauR#h@8Bdr;*osp5o7eV5SHPK9p_gG@G-Cb}fTIYj_WcSIr}VY} z{Kk*|@tNRgx@jbdcKMAHC*;J$$bULLGvFh`r4pqd7i8~F{qnqe*30zn(8(J64A1Rg z8(Wdk&xaDxB?mWDZI>7N(G;G7v9t?!I}<_#`oSd5R0hweQ;%YyZUdJyZ7VNNcf|1$ zm>2+)FtQp&sV7T;G~IHF4)sMW+#WM2h1@@{qjTGk)4QI<*Q@m620}C7-k&G*5)4af 
zv=Mw>-{m__xURVQ>FY8@gMWM&m(^JmF`cJyeDujer~d&6?Q`w|2!&O|2?nsBq4x}X zUYPU5NRKKuKmQqsRT5e@_zr~W2}jMz@mebQirZD+fq?1V{n}PF)ThS-!e6XAW*?lD z?)9_CbmUiEA5STacC5S6w_k%Ye2-fz^b*JNyhKn-H|fCoVcBWnS?a%VlmRCZNYZhT zjmo(ikmYM*ePfDfI=ch!T5OMDp_Gn^*mRN7@NRjhRwu-5Fa0rLu0PbQoqBC^VlF(l z8u{^_i1l1%COkV}Upou^zKPVLZ_hjHawOF!Y_l~#>AsQs^pY6Yd|FZ@yROunS);fx zBq%dodYjjwM^u3CtjkamT4;jA1}*P2FYQrtfRqq(){QN0yMnl17LM98zkyxm?SLr9 zopC7w@~UzL4Z2C$u4)e>7Mc5sf?4=4-?n$)| z?+Z9nU)`t10X1Be3xhgNw2cl&-H|iP9WGcM(Nt)_uLX>J&U&{P6?Ug~5z~c`U_aqC z)c<&zI|%Sj2Z*N^bW%lYK!WPy(_%8Ca@?CX372X;9}UVC&=A_?p^@TNm&6d@FS>DGAc2K^K=hD4ikt;;zXA|0zZ__-{Lx!o# zpjWiTUJ)|Wjuj}kwPX>{#<(_i=_!=f`RdB*8XjfGm@n)w!5;oz@o;a}uy-QXV4MG- zidINyN&L24uey2R*SliZG{zZFNR9NVf05$%R&V&7t?m1z8osHS(YM!4GS9)q_XZ|B zI4JVGti}lL?^q~A?hH4c*M5-SfxIB)xmK_E{i*H1mSkzIrJ0=It>1KpeL?!!FFuK*TsSUTi$c7MuSYlHAbrFGl^8*w5*<2OC?+{&TH=@nIBJ3D%Xw#{KidKP zZnFT%$4dq&lf|G;f3;A*i9X}zollv}SaK*jLDzG*rO~QH;NFwg7X|^r_=?GL^n;6X zqqEtehV~!k!^Rr(@6YTbj?O*t~-u|mx`ai)5H(K zAHf7si;hRvyaqL<8V8^(-#@KVoAgTfzId%_V<#G#bZQ3rjw~C4h(CQ8H>ksA zO;*D@4%9w}E->$>vww7He>OW;1gjK%pO?)S+f)3hB9Hu#+~5mKViJp=7srqJyuXB7 z3d&IPwAwP}wnF}VQ%EWEsomf;g;jIOFmd>8g%$Gs=h(4#jDtg`WcygNz**ZQzvw_# zLpRTR{tvx4^XibyZLttdBz|+2Iw@E;IH)%2!pOmx)i66?HauXlL_#5C66*iXef?|B zN&~C6kEa|LV2v};Y}ZkykNkQZd?{%GlowesTp*h>fNZ=QF)Bwb#E9Fe-};&lDR==ZvVx*XK1gptCos$m{kUGV8;b zP;uk!NC0{h2)_i5lSZcUtPYgx`k* z{UCEw0pRA!vgQ?D&zp$Lv-j9KimfmK31X-^c8v9HbXdn4`+`x#-Dt5a^ZEp`*26cJOB73$B#u~o>1zy zlO;x(Feo+LEwt`N&&d|OJFa=ml%=@0uZ7tLrL4R#?$Y={A+R*Ob-2^p7kvCCRd zQe|vzGPF#^v%`wqjQ<|Z!LF68LkxEftln%mKAX?>%sIvXCg}~-wRk$t&EXBz?tXbK zbxt23maDwm&~}NjY&ujV`XOCrR_-ENw&|8PNz;oH&cS;ob;%KDX4nVkdHO^QR6R8l zH+J-Iyi(w3_NQHa?n8unt6$(ZD+;q_JNB^ThSq6;)5xub)Op?J@H$f`m*M`glP4+DXCX5~nzGn|D#HxBk1E#SlcV$1v^8 zF+6RtJLd@k$szkqPra*6`R_T;q*pkugMqRbOGOKh<>n8qsH+4Ve)F`doq&tP<(?m!{a_`)%JrpMWx*f#Sd{yS6nQgkz}HaFWO&LH&b^m@;;-S z7c`Rh=KT2gdbKgs(m7qY5?7r>@_J4766>4rK6F68vvKaQz4PhdaBO7-NE_jPPtT0Y z)GX)HFfO>`at!73+n7b>_hydPUnKqXA=uVmJemp{{pxclpLAb}T-xb^XwjFjd*E1i z&Ghno<^o}^+P*<(2A$Ze?UVyGXYfbb?JgY3t<1;G<|4lc8mw`oBd8O!#OTHZ0A?b+ zEG^lNz)T@J%<1!vhgpcoBZg=ay3AWa@q<7zyb1vAF%9lrLp?(~iSw4WOD?B=bnZcj zIC;cXFU`rG7MDs1a={b1^!u)O$iIF!$HK0@!`KfF#^~nh8ETgfirzaYl5E1s9ge6> zbm}YT%Ff9-AacdOTJ7SXd#Pf<1ys?WGIg;*6e6ya-_IRhZ2uZ85^bR(89oS}wU=ub zU@-^CJ^B1UtvpGOF!W0Iq5fR$XNsb*GQWGdGojIYFESvr-HHbc-uFV8e^{y{Gq;@Y z{7m7qqaEizaVc8G*@HLXX+wQ5*QHVT(#VSYEQx`+WtiUF?pO_ntb+pXxjAw^0$HvuP@3r?m*@hLe1#?bUMI- zh~8?#l6VjJyIj1s*l>VaO!lCooTM|ulmrg?YtK19Y}t0NU>U||PUCaM)B=R+LW zyFz4x7a~cA+oP;uY zWrLrmxHY(Z^{iS~8T@*0rs~Y$*%X}&08!x4VsC$C#*2#++xy$o(F?0Vp!aSt$c*I; zuuJcYUTaZHRgkWITl|=rE+r2=mMGk_?^DuTAO>ODEB2>+QUZt>b?Zd zAp1cZd%FU+H}dILO4hBP|K9(|-S#qLqHoi1iHC;L}F7T@gc$2c+2;szWv|7oZHLmybF6kG4V^??O(4p$Nc-L~Or9;f{$ z3-vGU4x|rF^ad8BZ>AwOYy&&FuNwg zflIPt=2ddHKQXcZZFm3vbhV5QP(9qOhtW`WGNNV&;(WREs*)f$e3j@rZsb3NngO@V zCS~~5ZHuQOYjQEOM8jVblB;NHYL;5i1FNj8mt3YU^>~+JRas{=OYs2q;NwJLcUtP> zUFV=xdwbcRJ5x4g#WHL0jFkkePouTeFYRD%X=l}Jx&6NfHB+joF?RyO#-Q1Q3p$IZ`Q{DjcRM>xzqOc9^@fa?7?Bz`cAxdlyoLr5*oy3D;+^8_ z^Ci!;v%GF%x2i`6H?ldrt`Oo86m^uVTs4}{wY9fdaVoFZwpO0EbmC=u0FoXT(^eNM zKgv-a?k*BA6>!NMiI1qB8KS%V%R1z@KD>!i7jebnB%WKX8Lp(Y_^&%5<2e*(PVMtM(^++AG#i>t} zXMB+Yi=kN&v^>@w7h_6Sw_7a0ue3Y7SPw>$$Q8+Y{0nR|n{Aw`p0YZ@EJpf9u09|nI1T%F%so{ZHyfsI@TnTd_3=W6ZvoZu2IM_u?^6NPcsKE}p0 zj_pW~1-FS;DSQ@pO(013N2H`?ebJ9;HnIh*3G9WaN_99?Sx7<&Xd>qwn;)>VvlGV% z=p>0{0lyx!qXSNNr`4wsWyZ~9_~hI@Dm<+rrKOyp4&f!g;0IJf%bhrMlkSW17L|JD zV$zyLE!X&A-D5cqMzW>BU)GpU54P4ysg4e|F zSnlR>6asTV`kc@X`+wFZq}~nAQ>kJe=Es)l7|9erMofb(u1>c>pz7KCQN0f*>)rK} zB}vJ{XYJ*cmGMx>L&#d)yyJ-~dL!NGQ%1&p;I@(}JQ*Shv@gIcdi@z>)if}`Q%c~4 
zp!`|@Q5r5c(qFRGy0EVcrmD&~LCP%kLuc(nf4=Qr?TtF_z#;@Wvpxu-$8J5)*jXCz zvW!}d@q`@DSj6Sj)Vw}-f9fBcwQsE7HfXt1la-M0@MeV&626Q4DeeJYkA4=5=&aT& z(|IN+xU&8QEN^AS3|dY$bnMi(&iqK_*nq*)dioF#5ASH!I@|2}Z2w~MFd$4Wip=a} z<0sIal_F?I#-f@M=n416nXhwr`Rdh!gT_tTl%H>n+{Wa*HZwhW&APCEZ)|wXfW$oy zdTy5<1`9Q5^)WkGXdD|~6S2>Siol3K!x=wx%WAFy5AU?QspFx9}wNZ*fCf#W#H z4VJsR`^7pBBtpOXZKrvq>5mjXz4ib}$L4(@K=y&EXlUCN-dca$dFh0`kf%OI5WSlX zMIy_qE*v^PqG|Cg;R5O%*QagBGsBkagXT|6_x!I;JA(mL8O5zM}LNS7}?qH0rMzmY7)!G(Z;m;o|^+|i(Kzj zi`!o(vZ}9MZf5t73f zZ{BRp*YnW&?(ucgT>_28$9-VPlSqcQBN-8KQdhX3qcH`#m6=M5hP7C64HJ{tb`$~% z48X?8+0#uZ9UUEgztp6aayXeQm~%LV*9_CK7K|GsnNgZ~@-hke-2GB{N8E`}SJ%?G zMLiy1EnmNWZQ2!bPp`rh19jbwrivotdg1EI&!V0c?Y;}^v=_4I4tZ}ckkY8b*0T1Lo@{+=92$8&G+x$D{QBpw)`35 z@Y=7PuJFJ1*qf`x=g=*e1xADW>~j6c6ekBHmDwL&*&F?EC1cezPS^_E-he!tKSo_2 zcH0YGQ(s4qqc5nYlcZx2bA4BelX*bvg2ag`r+yVSU}6zVeR+R>Ns#HzWby4R8ag_m zSU3cxr(0uQJH>_0%N-cE2G9C9IXMXe-F-kg$zHTu-b?^^RpGYtCa{K_+#T2V@A@ctL^JtSRr_z02zEY;0__vA4$t-7dM$cERno{HB^3 z^rl2-Wi^VYZ{ufr@Ep<56H6H)UcU?H1?TPuGLBsrZ~0SsT~=j(qze-!Fs*caz@p$Y z$L4OkFu#RU#r*ySpr*^H7DtxZL6{9Hy z@tj%_WOoUuw^G~QO|VFu{(1=t>lI)J2J@xzv2Muk|@KfedJW~*6~xeP;ZPXVlhKf&$Suo6^Ry{>P_}$YK$dRJ&ZDqfrQo z1W=L?a)FhWo-Vn`gF{Fgot#YOhq~OP5O&P4UPE0?xA%E(W|A-9agr?nYv8q!YAr#f z!(Q|v*$Oc1fa}(|?wbu_IpUXL1adE5VtMS(easa1T+Ioi?dqe4lwV$~#pOZJs3%Zq zIJ-jrZSZs3g*||_o>784X=12^39=z5Q6M4%I0mw)4+o3Qo=u7#fKQbA0@*FAUmX!Y z3}I5rdN+i?#?wz;acyKU;%C77(gZ$*$4w0l4e`GHL1DkpK&&otjP2Np&;cDV*^gw1 z(nvK2Abq;Qn7=-2^hq(U>d`ePYoWG16N|uw*wG zdMhd2OG`_Wi17j9AV&KuOH0e;Sn-37emV%y4d!!d<9D@P2wGV3i=~r>uMFrGsbz>@ z=qH=i&zn`x(nm7H`4DeEC&JGE_HA`*EDs1sKyW;5Da;_?nfECxivWNf5Okw~^#j~P&sUjXKql_S-w$8w z>aLy5JB2YRHSARl(+|6bm6AAH=Oi$Bn~Y{jQteJdfu8_>7bo!t*CgQ53Yhj08QE%m zJCaLQbMIL;=WrI9S!w5wq*8(eT0NEjj#Jwu%Qy9G_&e_^&b31bS{%^hU={>~zZz>3 zt47cf%+M((KOg%xAWNf;2_>$!@=|~_j*Q2I+LN5i*9pp+<<)f(!!IsQ1(>?K;ETy$ zIiI(;Hyby52eNCJ3(9yxO3kSOjZ+)-(?#6j!1iKVqQBQx_I z;Bg>)du`=}9raN;Ed!ZX@02b$#$mp$)3$aMh?)<@Y(!!hf&NX95UQNZL`uI{U{NYOLkW2TkE+&opoSx%J~4 z8MtA#-$P-zSKc>o&qH_&vr6*vc42m`^XoCvVIQ%R$yysY%Kgan*Qn5aPoRWpT1;qp(gj>R0g@KT3gA%8 z7sH+`IP^QHk0lp5aBB`1;1>E(?;P_>+tUkssL@&T zPNc^2o~)*}$%bexrzg@G-@8vrUwe`fgaV_5C3DL`Z~Pt+(h=OJgKrA}dFrkHkxngq%Xe`F zvrJ)7W`aNJcFhrPw}UDrq0Zd4!3g)~IJc6!2UNaF}TbPj3hn&WAg zbur6X0>1((Gr}PHuO0~7)0Wl9@II0sSG7GdsCN#+@s2#Qk0c|;m$JB#8eifHBb*^+ zza&P&h`z+{aJ5*fVEwB?J$ipzwYn3j9}@tQOw-v?|IUVakZhm5kR3~8L+)>n6Uy27 z>23**gi)$L+u_<7)bV#0EM4ozLCz}nlMG$Gj0nD&m~&)UGzn+(d05acX?&Gq6MG0h z!A`(kQ}CL$fA?sR^_^V=Ynr+_YrXT_gWV{$0Olc7I@|hDzYSnH;=Z^dpvioaze_(^ zQwX)ql;JRP#SjUOs6w+Si{9|fUC>5YQ5L`ffW2nw8hIg`qmqOa)*M=i8|yn)2P&bhsy`&GYR{9cX7)V*cO0G@es$$z472)8!6s75 zk$aJhd+p4SB%}HMh@N17zHt6eOih_NjY@|lhood{w;kZ9AqEk-xSEe{D@HRQEdqQM zVTJuHzx`qp>8R9o=_bfd8YR|P-TSCZ#fb2dSJd#t5@7|9pS9j@hf zJvG{dMXyCNM=Er`b6UV#yg`AtIT4&n$X-uz_t(R?bBXxy519DYzVcPAMDYQC>gU%{e`FTmG#yx~mv_`22U>8-{2)j6^O(e3_!IJZCa#oVZ*|R1aX=zT{o^ z6D8f3Sb;r%*$o~o)uv#cDqmdx$qBhEhHzk}j^}Ji_KL1LCea-FfR%-|!>~Nv$ER=c z=mr*zP+wnvpGPeN^hxB}DL1J*{c2=So@$9Sf;RwLwa#*ihq(KJL}3xzmeub^q<3yq zi;hF8P#UZB^{dHbqi7skTZzHsf1&S6bLlz1=GFW}0bS(A#et+c{ojKawA@vvzZp{e zum0yI;9$8kSdpC#x1mL2?VblRr%%Qa$aeR6fZ@=#e{Y3+I>g65U^lSXN?TUXiUBWo zT$KN^%ODiYvY9mO05f{Lc|4R&c4y+-%CF9;3~3;-Gb&y&qpe{WX?t^wi8J$-Oi3_pGsMZDz?97s|-?_6T;41{UGmJ0fwh|v;P(cFMl4G&7_5V6Jn^H0Z zU<29xuw2PDj#Jvr;2734#M>(kf6d#stcm5K3!0k5V0=WtcV6 l`)04~-2Z)^bqtNu*;cmmgMFF|cugIoB&Q}@{=ziqzX0Ag*t-A# literal 0 HcmV?d00001 diff --git a/docs/examples/te_gemma/media/graphs_2.png b/docs/examples/te_gemma/media/graphs_2.png new file mode 100755 index 0000000000000000000000000000000000000000..35c34ede5559bd0c26ce807789ee6d3fdb2bb062 GIT binary patch 
literal 15177
[... base85-encoded PNG data omitted ...]
zC7xuuL>S81;lATh6$Y1~`?(ySeJI9#FBfItZQ8k`@C}k%OF;yz3kWg}-Ja>budwPwBqK7fx>WE%#3GA1@!bmNx7;mxCkRG&d1YNGEChVgVKKdzN&jA^{G%zv22*{r57?(@#$1ggdW;pA7HQltGir_2~++~WFR;J4 zqHC?=2g;+qDghFS4A?SC1r}_mzq<)|c2_9Uwesz`uN{h4NA}nw2isigx=sS3+*YQ? zDE>&994-oAA7T{;9$`+0gT8mERt{C#ew(WlQ&q|{wIFT z#W&FTW`EA2cib!V?i=qNAvI*$C+m4=D?3E)ug5K4k3F;A+8zF4KE!dqp9)N%W?t%e zI=gj-l&NS{xre^4sBL;WGnXFp{L?G4Ge;%R_6FB3qOZZXk+RP{620+v=$3Lbl9Q-j z{Tbou9Q@j4n*v@#N2I$TFUR7{l#AOcvy0UNT@+&xW|OMP#`!XGem{Rs_wdOErmjDSe99N=@5UbTk- z<#hCJl`^AMfdJ_xe9vkCz*AV@;QW9*Gf1RvojWX*1YhN79~s@^#OI5ZBy#*HM4XdC zTe@d#c%yOtpD;f>QOkr85bH|Ir!%A0%4b^n)iK6_Q(RnVYmI@v!P&0NX`4t`Qor1lcIV%X|^t42*4vhl&_JzRmk|{RJA^N63pzRW{v5%Bj zwv-(#(!kxQ=bkRt^pC>U8gHA%=N`n0smPVT9SsWBD(#&?A5oEpW@dKI&d&MG-4QRU zO<6SFwDl{q=ZYOf14t?PIn-)MLMUqzOYD!jXAr1)VGU-L+iMp6<4^w^)Ar144In94 z#si{FK0I1qypK%Wa`w*xnY@Z+`Wgn-K)=DdYvNMyYKSmy}@2%F0+%ZQ8Cei+dok)QG4B$Eode+{ibl z`Uz=~J4z4@wo6l}|LpFe?wZ{EvBoP6gkFN5!-rWBn0$9VUpFF_!?`Zqo24nGPZ3jO z1&)2Z48VYiy>3;{MNiJv?_Vo2xpa0{%>F?KqCM12p;C#riTS!V| zl*u_YLm7Upis*dD(l0@scMvI@F651X)S7|tqwogSmxkYr)}K zd4G1R-^Wwdt(|9i?J?_tv&P`gwYtd&qv=QX`QqQQYYktfu_qDi9%Pm}iuu=XWmDs& zZ3**63W?eXtCF8@hClenCeQuTy#CtNN8)yljwrJyX`HS%vlHK=x}z6(cDlkn;My@7 zQ$^-BHpNsb6)*{Obwx6lp5FkgbHuAOkt#qM{F8iqBm+;)fr`0>1t@2Q9_?~WY%#l* zvA>|nU2798Bl((BeopV3tHt5m!qS?5A>xR$nnU@OX+>Sq_)@E#jnXy=6#w^T;9uu_ zeSAKgEWF~lVq;^YY?3x%a4Wevzr`?h7-*7=YIrhATwQS=iLaOm0z~Jg$*BFGHinkA zwg_J+)IidUWhQab`~D_Rk6B>Gz(U%{YF;e>gbMprO3JZ*sCD!}62HLk=xDEMe^)l4 zyj<4H#6NnVXlr+`tK`n}(;b`j6O^EaD;RrzryZ}|<@S6Z+BU5QIz_tR|65#%U*8=u z3oX}C*WkkA$|o+daTrXRo6xX~DR=k-Em9K<2A8#6iO-1i_s911_QIFTN>aCfTTbJ| zEEwN0GD>K6QCyFDvC-Wyt}a8~Qc0cFQs&N1d_(n>mp*DW@|O(0{(U+sLmMC|cQeA) z=llPIe}3)z2fbtO9<>F&sz3JM4)gwBIN{hnKUs{Ia{7`{4bX-|hWe&=aJL^k`@aD7 CQjcT+ literal 0 HcmV?d00001 diff --git a/docs/examples/te_gemma/media/plot.svg b/docs/examples/te_gemma/media/plot.svg new file mode 100755 index 0000000000..481f156df6 --- /dev/null +++ b/docs/examples/te_gemma/media/plot.svg @@ -0,0 +1 @@ +87.68 s54.11 s28.22 s16.75 s12.13 s0 s10 s20 s30 s40 s50 s60 s70 s80 s90 s100 sHF (baseline)TE (subsitution ofGemmaDecoderLayer withte.TransformerLayer)TE + THD attentionTE + THD attention + CUDA GraphsTE + THD attention + FP8 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/thd_bshd.svg b/docs/examples/te_gemma/media/thd_bshd.svg new file mode 100755 index 0000000000..47eed69565 --- /dev/null +++ b/docs/examples/te_gemma/media/thd_bshd.svg @@ -0,0 +1 @@ +BSHD LayoutQKVQKVCumulative sequence lengths:3, 3 + 1, 3 + 1 + 3, 3 + 1 + 3 + 1Sequence offsets:0, 4, 8, 12[batch_size,seq_len,head_nr,dim][total_nr_tokens,head_nr,dim]Seq. 1Seq. 2Seq. 4Seq. 3sbtTHD LayoutPad. 1Pad. 2Pad. 4Pad. 
diff --git a/docs/examples/te_gemma/requirements.txt b/docs/examples/te_gemma/requirements.txt
new file mode 100755
index 0000000000..c90fb6dad0
--- /dev/null
+++ b/docs/examples/te_gemma/requirements.txt
@@ -0,0 +1,4 @@
+transformers==4.41.1
+accelerate==0.30.1
+datasets==2.19.1
+sentencepiece==0.2.0
\ No newline at end of file
diff --git a/docs/examples/te_gemma/run_gemma_2b.py b/docs/examples/te_gemma/run_gemma_2b.py
new file mode 100644
index 0000000000..db2fb087c9
--- /dev/null
+++ b/docs/examples/te_gemma/run_gemma_2b.py
@@ -0,0 +1,15 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from huggingface_hub import login
+
+access_token = ""  # <== Add your Hugging Face access token here
+login(access_token)
+
+model_name = "google/gemma-3-4b-it"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+print(model.config)
+input_text = "Write me a poem about Machine Learning."
+input_ids = tokenizer(input_text, return_tensors="pt")
+
+outputs = model.generate(**input_ids)
+print(tokenizer.decode(outputs[0]))
diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py
new file mode 100755
index 0000000000..eb781f11cf
--- /dev/null
+++ b/docs/examples/te_gemma/run_generation.py
@@ -0,0 +1,22 @@
+from utils import *
+
+hyperparams.model_name = "/perfhome/repos/ckpt/models/gemma-7b-hf/"  # <== Add model weight location here, e.g. "/path/to/downloaded/gemma/weights"
+hyperparams.qkv_format = "thd"
+
+# Enable CUDA Graphs for the generation phase.
+hyperparams.generation_cuda_graphs = True
+
+if hyperparams.generation_cuda_graphs:
+    # It is necessary to preallocate static buffers:
+    # CUDA Graphs require static input tensors for every kernel.
+    # This approach slightly increases memory consumption;
+    # however, the substantial speedup achieved makes it worthwhile.
+    hyperparams.cuda_graphs_static_batch_size = 64
+    hyperparams.cuda_graphs_static_max_seq_len = 1024
+    hyperparams.cuda_graphs_static_max_context_len = 128
+
+hyperparams.is_paged = False
+model = init_te_gemma_model(hyperparams)
+
+print_sample_of_generated_texts(model)
+benchmark_generation(model)
diff --git a/docs/examples/te_gemma/run_generation_llama.py b/docs/examples/te_gemma/run_generation_llama.py
new file mode 100755
index 0000000000..2f90995bd1
--- /dev/null
+++ b/docs/examples/te_gemma/run_generation_llama.py
@@ -0,0 +1,10 @@
+from utils import *
+
+hyperparams.model_name = "/perfhome/repos/ckpt/models/llama2-7b-hf/"  # <== Add model weight location here, e.g. "/path/to/downloaded/llama/weights"
+hyperparams.qkv_format = "thd"
+
+# model = init_te_llama_model(hyperparams)
+model = init_baseline_model(hyperparams)
+
+print_sample_of_generated_texts(model)
+# benchmark_generation(model)
diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py
new file mode 100755
index 0000000000..f24b700979
--- /dev/null
+++ b/docs/examples/te_gemma/te_gemma.py
@@ -0,0 +1,808 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
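+
+# Overview: this module re-implements HuggingFace's Gemma causal LM on top of
+# Transformer Engine. `GemmaDecoderLayer` is monkey-patched with a wrapper around
+# `te.pytorch.TransformerLayer`, generation is split into a context phase and a
+# token-by-token generation phase, and both phases can optionally be captured
+# into CUDA Graphs (see `TEGemmaForCausalLMCudaGraphs`).
+#
+# Illustrative usage sketch (not one of the tutorial scripts; it mirrors
+# run_generation.py and assumes Gemma weights are available locally):
+#
+#     from utils import hyperparams, init_te_gemma_model
+#
+#     hyperparams.model_name = "/path/to/downloaded/gemma/weights"
+#     hyperparams.qkv_format = "thd"
+#     model = init_te_gemma_model(hyperparams)
+#     tokens = model.generate(input_ids.cuda(), max_new_tokens=40)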
+ +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.gemma.modeling_gemma import GemmaForCausalLM, GemmaConfig, GemmaModel + +import torch.nn.functional as F + +def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): + """ + Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which + will be used later. + + (Currently a hack, this should be reformatted to a better method) + """ + + assert lengths_tensor is not None and max_input_length is not None, \ + "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + torch.add( + inference_params.cached_sequence_lengths, + inference_params.input_sequence_lengths, + out=inference_params.cached_sequence_lengths) + # inference_params.input_sequence_lengths[:len(lengths_tensor)].copy_(lengths_tensor, non_blocking=True) + inference_params.input_sequence_lengths.copy_(lengths_tensor) + + inference_params.max_incoming_seq_len = max_input_length + + max_seqlen_q, max_seqlen_kv = inference_params.max_incoming_seq_len, inference_params.max_sequence_length + + # # Allocation of buffers, it works correctly with CUDA Graphs. + _allocator = StaticBufferAllocator() + NR_BUFFERS = 4 + + cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded = [ + _allocator(inference_params.max_batch_size + 1, dtype=torch.int32, device="cuda") + for _ in range(NR_BUFFERS) + ] + + torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) + torch.cumsum( + inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, + dim=0, out=cu_seqlens_kv[1:]) + # If layer has shape [b * s_layer, h, d] + # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] + cu_seqlens_q_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q) + cu_seqlens_kv_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv) + + # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + + # print(inference_params.step_dict) + + def get_cache_params_in_infer_params(): + return max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + + # For the time being, create an ad-hoc field in `inference_params` to get the variables. + # @sudhakars: to create a better way later. 
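+    # Worked example (illustrative, matching media/thd_bshd.svg): for a batch of
+    # 4 sequences with incoming lengths [3, 1, 3, 1] and an empty cache, the
+    # buffers computed above hold
+    #     cu_seqlens_q  = [0, 3, 4, 7, 8]
+    #     cu_seqlens_kv = [0, 3, 4, 7, 8]
+    # while the padded offsets are multiples of max_seqlen_q / max_seqlen_kv.
+    # The accessor attached below hands these buffers to every decoder layer.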
+ inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + +# This class has been modified from +# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py +class GemmaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + return emb.unsqueeze(2) # should return in [b, s, 1, d] format + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthless, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) + +class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `GemmaDecoderLayer` and easier to replace it in the code. + + Args: + config: GemmaConfig + args: positional args (for compatibility with `GemmaDecoderLayer`) + kwargs: keyword args (for compatibility with `GemmaDecoderLayer`) + """ + + def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): + + self.gemma_config = config + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="geglu", + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.gemma_config.head_dim, + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, + ) + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. + """ + return self._allocator(size, dtype, device) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. 
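+        # In summary, this forward pass: (1) builds the rotary position embeddings,
+        # (2) pulls the THD cache metadata prepared by setup_cache_params_from_infer_params(),
+        # (3) strips HF-specific kwargs that TransformerLayer does not accept, and
+        # (4) calls te.pytorch.TransformerLayer.forward() with these extra arguments.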
+
+        # For this tutorial the non-"arbitrary" mask types are used: "padding_causal"
+        # for the context phase and "padding" for the generation phase.
+        # @sudhakars: find a better way to provide the `tensor_format`
+        te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)(
+            max_seq_len=self.gemma_config.max_position_embeddings
+        ).cuda()
+
+        inference_params = kwargs["inference_params"]
+        # @sudhakars: big assumption that the input is "sbhd"
+        if inference_params.qkv_format_legacy == "thd":
+            (
+                max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded
+            ) = inference_params.get_cache_params_from_infer_params()
+
+        # These args cannot be passed to TransformerLayer.
+        keys_to_remove = [
+            "position_ids",
+            "past_key_value",
+            "output_attentions",
+            "use_cache",
+            "cache_position",
+        ]
+        for key in keys_to_remove:
+            kwargs.pop(key, None)
+
+        # We need to return a tuple to be compatible with HF.
+ return ( + super().forward( + *args, + rotary_pos_emb=te_rope_emb, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_kv=cu_seqlens_kv, + max_seqlen_q=max_seqlen_q, + max_seqlen_kv=max_seqlen_kv, + **kwargs + ), + ) + +class StaticGemmaModel(torch.nn.Module): + """ + StaticGemma is based of HF GemmaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: GemmaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.normalizer = torch.tensor(self.model.config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits + + +class GemmaGenerator(torch.nn.Module): + """ + GemmaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: GemmaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.gemma_layers = StaticGemmaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.gemma_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + logits = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + return next_tokens + + +class PartialForwardWrapper(torch.nn.Module): + """ + This class wraps a `torch.nn.Module` while partially modifying its `forward` + + CUDAGraphs' `make_graphed_callables` method takes in a module but if only + `functools.partial` is used to wrap the module, it changes the modules' + type and that interferes with the `make_graphed_callables` intrinsics. + """ + def __init__(self, module, **kwargs): + super().__init__() + self.module = module + self.partial_forward = partial(self.module.forward, **kwargs) + + def __call__(self, *args, **kwargs): + return self.partial_forward(*args, **kwargs) + + # @sudhakars: should we use better abstraction? 
+ def set_inference_params(self, *args, **kwargs): + return self.module.set_inference_params(*args, **kwargs) + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `GemmaDecoderLayer` with custom `TEGemmaDecoderLayer`. + """ + original_gemma_decoder_cls = transformers.models.gemma.modeling_gemma.GemmaDecoderLayer + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = original_gemma_decoder_cls + + +class TEGemmaForCausalLM(GemmaForCausalLM): + """ + Causal LM created with `GemmaModel`. The underlying `GemmaDecoderLayer` + class is monkey-patched with `TEGemmaDecoderLayer` class before + initializing the causal LM with `GemmaForCausalLM`. + + Args: + config: GemmaConfig + """ + + def __init__(self, config: GemmaConfig): + with replace_decoder(te_decoder_cls=TEGemmaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = GemmaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticGemmaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths, max_seq_len=None): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) if max_seq_len is None else max_seq_len + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. fix the max seq len to + # the longest sequence in the batch + actual_max_seq_len = max_seq_len + inputs.data = new_input_ids[:, :actual_max_seq_len] + print(f"actual_max_seq_len: {actual_max_seq_len}") + + # For Paged Attention, make the valid sequences, multiple of 64 + # inputs.data = new_input_ids[:, :4].repeat(1, 16) + + + def _next_64_multiply(self, x): + return ((x + 63) // 64) * 64 + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_hidden_states_buffer(self, input_ids: torch.Tensor): + tensor = torch.empty( + (input_ids.shape[0], input_ids.shape[1], self.hidden_size), + device="cuda", + dtype=torch.float32, + ) + # import pdb; pdb.set_trace() + return tensor + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. 
+ def _create_inference_params(self, *args, **kwargs): + infer_params = InferenceParams( + *args, **kwargs + ) + + max_batch_size = kwargs["max_batch_size"] + + # Initialize some legacy params + infer_params.cached_sequence_lengths = torch.zeros( + (max_batch_size,), device="cuda", dtype=torch.int32) + infer_params.input_sequence_lengths = torch.zeros( + (max_batch_size,), device="cuda", dtype=torch.int32) + + return infer_params + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _get_max_input_seq_len(self, input_ids): + return input_ids.shape[1] \ + if not hasattr(self.config, "cuda_graphs_static_max_context_len") \ + else self.config.cuda_graphs_static_max_context_len + + # The buffer for generation is some part (beginning) of hidden states buffer. + # This function returns pointer to it and also copies there data if provided. + def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): + # hidden_states_buffer has shape [b, s, hd] + # generation_buffer will have shape [b, 1, hd] + # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)" + # will return uncontiguous buffer, which we want to avoid. + output = hidden_states_buffer.view(-1)[ + : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2] + ] + if data_to_copy is not None: + output.copy_(data_to_copy.reshape(-1)) + generation_buffer = output.view( + (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2]) + ) + return generation_buffer + + def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): + # import pdb; pdb.set_trace() + hidden_states = self._create_hidden_states_buffer(input_ids) + hidden_states.data[:] = self.model.embed_tokens(input_ids) + + # We need to update offsets before every forward pass to make cache work properly. + lengths = input_ids.ne(0).sum(dim=1) + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=lengths, max_input_length=input_ids.shape[1] + # ) + lengths = input_ids.ne(0).sum(dim=1) + max_input_length = input_ids.shape[1] + setup_cache_params_from_infer_params(inference_params, lengths, max_input_length) + else: + inference_params.setup_before_new_input(length=input_ids.shape[1]) + + + logits = self._model_context_phase( + hidden_states, + attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary" + ) + + # We choose logits coresponding with last token in each sequence, + # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) + # Tensor when qkv_format == "thd" and + # they are the last token in the sequence when qkv_format != "thd". + if self.config.qkv_format == "thd": + logits = logits[ + + torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, : + ] + else: + logits = logits[:, -1, :] + + next_tokens = torch.argmax(logits, dim=1) + + # self.hidden_states have shape [b, s, hd]. 
+ # We return hidden state for the last token - output has shape [b, 1, hd] + hidden_states = self._get_generation_buffer( + hidden_states, self.model.embed_tokens(next_tokens) + ) + return hidden_states, next_tokens + + def _make_mask_one_token_longer(self, mask): + return torch.cat( + [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1 + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + pad_token_id: int = 0, + max_new_tokens: int = 0, + *args, + **kwargs + ): + self.eval() + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast( + enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None + ): + + lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] + + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) + + # This is not needed since the padding to the left is already done in utils.py + # # Pad input_ids with zeros on the left to match max_input_sequence_len + # # This adds padding tokens (0) to the left side of each sequence in the batch + # # Shape goes from [batch_size, seq_len] to [batch_size, max_input_sequence_len] + # input_ids = F.pad( + # input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 + # ) + + if self.config.qkv_format == "thd": + # For thd layout padding is at the end, otherwise at the beginning. + TEGemmaForCausalLM._padding_to_end(input_ids, + lengths, + max_seq_len=self.config.cuda_graphs_static_max_context_len \ + if self.config.generation_cuda_graphs else None + ) + + # import pdb; pdb.set_trace() + + # InferenceParams is a cache, where keys and values of previous tokens are stored. + # Moreover it stores length of both already generated and input sequences. + inference_params = self._create_inference_params( + max_batch_size=batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + # is_cuda_graph=False + ) + + def init_cache_params_in_infer_params(inference_params): + inference_params.cached_sequence_lengths = torch.zeros( + (batch_size,), device="cuda", dtype=torch.int32) + inference_params.input_sequence_lengths = torch.zeros( + (batch_size,), device="cuda", dtype=torch.int32) + + init_cache_params_in_infer_params(inference_params) + inference_params.qkv_format_legacy = self.config.qkv_format + + self._model_context_phase.set_inference_params(inference_params) + self._model_generation_phase.set_inference_params(inference_params) + + hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + + # Generation phase. 
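+            # From here on, every step consumes a [batch, 1, hidden] buffer that already
+            # holds the embedding of the previously generated token, advances the KV-cache
+            # offsets by one for every sequence, and appends the newly selected (greedy
+            # argmax) token to `output_tokens`.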
+ if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params(inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. + # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + # setup_cache_params_from_infer_params(inference_params, input_ids) + # @sudhakars: could create position_ids from mask here + next_tokens = self._model_generation_phase(hidden_states, mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. + if self.config.qkv_format == "thd": + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params(inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1) + else: + inference_params.setup_before_new_input(length=1) + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="arbitrary" + ) + return logits + +class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): + """ + TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: GemmaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + self.config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. 
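+        # Reusing the first `batch * hidden_size` elements of `hidden_states_buffer`
+        # keeps the generation input at a fixed memory address, which is required for
+        # the captured graph to be replayed with the same pointers.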
+        self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer)
+        self.inference_params = InferenceParams(
+            max_batch_size=self.config.cuda_graphs_static_batch_size,
+            max_sequence_length=self.config.cuda_graphs_static_max_seq_len,
+            num_heads_kv=self.config.num_key_value_heads,
+            head_dim_v=self.config.head_dim,
+            head_dim_k=self.config.head_dim,
+            dtype=torch.bfloat16,
+            is_paged=self.config.is_paged,
+            page_size=64,
+            total_num_pages=64,  # 64 * 64 (max_sequence_length) / 64 (page_size)
+        )
+
+        # Initialize the legacy sequence-length bookkeeping, as in TEGemmaForCausalLM above.
+        max_batch_size = self.config.cuda_graphs_static_batch_size
+        self.inference_params.cached_sequence_lengths = torch.zeros(
+            (max_batch_size,), device="cuda", dtype=torch.int32)
+        self.inference_params.input_sequence_lengths = torch.zeros(
+            (max_batch_size,), device="cuda", dtype=torch.int32)
+
+        self.inference_params.qkv_format_legacy = self.config.qkv_format
+
+        self._model_generation_phase.set_inference_params(self.inference_params)
+        self._model_context_phase.set_inference_params(self.inference_params)
+
+    def record(self):
+        # We record the model in eval mode (training=False), because it will be used for generation.
+        self.eval()
+
+        # Here "the trick" happens. We override methods from TEGemmaForCausalLM
+        # with their recorded versions. After that, every invocation of these methods
+        # replays the captured graph with minimal CPU involvement,
+        # which leads to a large speedup.
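+        # Recording happens twice: once for the context phase on a
+        # [static_batch_size, static_max_context_len, hidden] input, and once for the
+        # generation phase on a [static_batch_size, 1, hidden] input. Later calls must
+        # use exactly these shapes.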
+ input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + self.inference_params.reset() + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + # max_input_length=input_shape[1], + # ) + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda") + max_input_length = input_shape[1] + setup_cache_params_from_infer_params(self.inference_params, lengths, max_input_length) + + self._model_context_phase = self.record_graph( + self._model_context_phase, + self.hidden_states_buffer, + attn_mask_type="padding_causal" + ) # CUDA Graphs recording + + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + self.inference_params.reset() + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + # max_input_length=input_shape[1], + # ) + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda") + max_input_length = input_shape[1] + setup_cache_params_from_infer_params(self.inference_params, lengths, max_input_length) + + self._model_generation_phase = self.record_graph( + self._model_generation_phase, + self.generation_buffer, + attn_mask_type="padding" + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor, **sample_kwargs): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=3, + sample_kwargs=sample_kwargs, + ) + return graphed_function diff --git a/docs/examples/te_gemma/te_gemma_loading_weights.py b/docs/examples/te_gemma/te_gemma_loading_weights.py new file mode 100755 index 0000000000..41f62ad7f3 --- /dev/null +++ b/docs/examples/te_gemma/te_gemma_loading_weights.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import re +import gc +import torch + +from typing import List + +from transformer_engine.pytorch.fp8 import fp8_model_init + +from transformers.modeling_utils import load_state_dict, _load_state_dict_into_model +from transformers.utils.hub import get_checkpoint_shard_files + +""" + This file contains logic of mapping the HuggingFace GemmaModel parameters + with TransformerEngine TransformerLayer. 
When we have initialized Transformer models + both with HF and with TE, we can copy parameters from the first to the second. +""" + + +def _load_weights_for_fp8_model(vanilla_model, hyperparams): + # The weights are loaded from the file with state_dict + # of model with weights which contains also fp8 parameters. + # The weights are in BF16 precision, but they contain fp8 metadata + # computed by the calibration procedure. + vanilla_model.load_state_dict( + torch.load(hyperparams.fp8_model_weights_filename), + strict=False, + # strict = false, because some parameters have + # multiple pointers to the same weight + # vanilla_model._model_context_phase.model + # and vanilla_model._model_generation_phase.model + ) + + +def _load_weights_for_standard_model(vanilla_model, config): + # The weights are loaded from the file with original weights. + archive_file = os.path.join(config.model_name, "model.safetensors.index.json") + resolved_archive_file, _ = get_checkpoint_shard_files(config.model_name, archive_file) + total_dict = {} + for shard_file in resolved_archive_file: + state_dict = load_state_dict(shard_file) + total_dict.update(state_dict) + + replace_params( + total_dict, + vanilla_model.state_dict(), + config, + qkv_fused_and_interleaved=config.fuse_qkv_params, + ) + # Copy parameters like embedding: + _load_state_dict_into_model(vanilla_model, total_dict, start_prefix="") + + # Force mem release. Taken from huggingface code. + del total_dict + gc.collect() + + +def load_te_model(cls, config): + """ + Custom method adapted from `from_pretrained` method in HuggingFace + Transformers repo: + https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579 + """ + config.use_cache = False # To make TransformerLayer compatible with GemmaModel + with fp8_model_init(config.fp8_model_init): + # there we need only to create model + vanilla_model = cls(config).to(torch.bfloat16).cuda() + + # return vanilla_model + # and now we copy the weights into it + if config.fp8_model_weights_filename is not None: + _load_weights_for_fp8_model(vanilla_model, config) + else: + _load_weights_for_standard_model(vanilla_model, config) + + return vanilla_model + + +def _get_all_layer_prefixes_to_update(hf_state_dict): + """ + There are many parameters in hf_state_dict, whose name start with "model.layers.[number]." + This function extracts all strings like "model.layers.[number]." + that are starting strings of keys in hf_state_dict. + """ + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." + m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + return all_layer_prefixes + + +def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): + """ + Replaces params from TE TransformerLayer state_dict with corresponding parameters + from HuggingFace GemmaModel state_dict. 
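+    For example, "model.layers.N.input_layernorm.weight" from the HF checkpoint ends up in
+    "model.layers.N.self_attention.layernorm_qkv.layer_norm_weight" of the TE model, and
+    "mlp.gate_proj.weight" / "mlp.up_proj.weight" are packed into the two halves of
+    "layernorm_mlp.fc1_weight".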
+ """ + all_layer_prefixes: List[str] = _get_all_layer_prefixes_to_update(hf_state_dict) + + for layer_prefix in all_layer_prefixes: + + def copy_from_ht_to_te(te_name, hf_name, start=None, end=None): + te_state_dict[layer_prefix + te_name].data[start:end].copy_( + hf_state_dict[layer_prefix + hf_name] + ) + + copy_from_ht_to_te( + "self_attention.layernorm_qkv.layer_norm_weight", "input_layernorm.weight" + ) + copy_from_ht_to_te("self_attention.proj.weight", "self_attn.o_proj.weight") + copy_from_ht_to_te("layernorm_mlp.layer_norm_weight", "post_attention_layernorm.weight") + copy_from_ht_to_te("layernorm_mlp.fc2_weight", "mlp.down_proj.weight") + copy_from_ht_to_te( + "layernorm_mlp.fc1_weight", "mlp.gate_proj.weight", end=config.intermediate_size + ) + copy_from_ht_to_te( + "layernorm_mlp.fc1_weight", "mlp.up_proj.weight", start=config.intermediate_size + ) + + if qkv_fused_and_interleaved: + """ + When qkv_fused_and_interleaved=True, key, query and value layers are on one tensor + in TE TransformerLayer. Moreover they are interleaved within each head. + Let q_i, k_i and v_i be query, key and value layers for i-th head respectively. + Then TE stores weight tensor in the form: + [q1 k1 v1 q2 k2 v2 ...] + This is done to maximally optimize performance time. + """ + te_qkv_layer = te_state_dict[layer_prefix + "self_attention.layernorm_qkv.weight"] + + def copy_interleave(hf_name, idx): + src = hf_state_dict[layer_prefix + hf_name] + for head_nr in range(config.num_attention_heads): + dst_offset = head_nr * config.head_dim * 3 + dst_slice = slice( + dst_offset + idx * config.head_dim, dst_offset + (idx + 1) * config.head_dim + ) + src_slice = slice( + head_nr * config.head_dim, head_nr * config.head_dim + config.head_dim + ) + te_qkv_layer[dst_slice, :] = src[src_slice, :] + + copy_interleave("self_attn.q_proj.weight", 0) + copy_interleave("self_attn.k_proj.weight", 1) + copy_interleave("self_attn.v_proj.weight", 2) + else: + copy_from_ht_to_te( + "self_attention.layernorm_qkv.query_weight", "self_attn.q_proj.weight" + ) + copy_from_ht_to_te("self_attention.layernorm_qkv.key_weight", "self_attn.k_proj.weight") + copy_from_ht_to_te( + "self_attention.layernorm_qkv.value_weight", "self_attn.v_proj.weight" + ) + + return all_layer_prefixes diff --git a/docs/examples/te_gemma/te_llama.py b/docs/examples/te_gemma/te_llama.py new file mode 100755 index 0000000000..426b79cbf1 --- /dev/null +++ b/docs/examples/te_gemma/te_llama.py @@ -0,0 +1,759 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaConfig, LlamaModel + +import torch.nn.functional as F + +def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): + """ + Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which + will be used later. 
+ + (Currently a hack, this should be reformatted to a better method) + """ + + assert lengths_tensor is not None and max_input_length is not None, \ + "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + torch.add( + inference_params.cached_sequence_lengths, + inference_params.input_sequence_lengths, + out=inference_params.cached_sequence_lengths) + inference_params.input_sequence_lengths.copy_(lengths_tensor) + inference_params.max_incoming_seq_len = max_input_length + + max_seqlen_q, max_seqlen_kv = inference_params.max_incoming_seq_len, inference_params.max_sequence_length + + # # Allocation of buffers, it works correctly with CUDA Graphs. + _allocator = StaticBufferAllocator() + NR_BUFFERS = 4 + + cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded = [ + _allocator(inference_params.max_batch_size + 1, dtype=torch.int32, device="cuda") + for _ in range(NR_BUFFERS) + ] + + torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) + torch.cumsum( + inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, + dim=0, out=cu_seqlens_kv[1:]) + # If layer has shape [b * s_layer, h, d] + # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] + cu_seqlens_q_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q) + cu_seqlens_kv_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv) + + # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + + # print(inference_params.step_dict) + + def get_cache_params_in_infer_params(): + return max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + + # For the time being, create an ad-hoc field in `inference_params` to get the variables. + # @sudhakars: to create a better way later. 
+ inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + +# This class has been modified from +# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py +class LlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + return emb.unsqueeze(2) # should return in [b, s, 1, d] format + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthlessly, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) + +class TELlamaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `LlamaDecoderLayer` and easier to replace it in the code. + + Args: + config: LlamaConfig + args: positional args (for compatibility with `LlamaDecoderLayer`) + kwargs: keyword args (for compatibility with `LlamaDecoderLayer`) + """ + + def __init__(self, config: LlamaConfig, layer_idx: int, *args, **kwargs): + + self.llama_config = config + self.head_dim = self.llama_config.hidden_size // self.llama_config.num_attention_heads + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, # LLaMA specific + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="swiglu", # LLaMA specific + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.head_dim, # LLaMA specific + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, # LLaMA specific + ) + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. 
+ """ + return self._allocator(size, dtype, device) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. + + if "self_attn_mask_type" in kwargs: + attn_mask_type = kwargs['self_attn_mask_type'] + else: + attn_mask_type = "whatever_default_is" + + if attn_mask_type == "arbitrary": + # @sudhakars: following logic doesn't work for `thd` + attn_mask = kwargs['attention_mask'] + attention_mask_inv = ~attn_mask + generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + + if generation_case: + # @sudhakars: for some reason, `attention_mask` for generation is of the + # form [b, 1, 1, s]. + attention_mask_inv = attention_mask_inv.squeeze(1).squeeze(1) + assert torch.tensor(torch.tensor(attention_mask_inv.shape).shape).item() == 2 + + # Create `position_ids` on the fly using `attention_mask` since HF + # does the same in generation logic. + position_ids = attention_mask_inv.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask_inv == 0, 1) + + if "position_ids" in kwargs and kwargs['position_ids'] is not None: + assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" + + # convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for + # context phase and context phase gets [b, s] sized attn mask + seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] + arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() + for sample_idx in range(attn_mask.shape[0]): + pad_len = attn_mask[sample_idx].sum().int().item() + # set the columns to padded + arbitrary_attn_mask[sample_idx, :, :, :pad_len] = True + # set the rows to padded + if not generation_case: + arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True + arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() + + # Update the attention mask to arbitrary + kwargs['attention_mask'] = arbitrary_attn_mask.cuda() + + # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding + # @sudhakars: change the hardcoded `dim` to something like config.head_dim + te_rope_emb = LlamaRotaryEmbedding(dim=self.head_dim, max_position_embeddings=self.llama_config.max_position_embeddings).cuda() + te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) + else: + # When the `attention_mask` is not `arbitrary`, then for the purpose + # of this tutorial, we're using `padding_causal` (for context) and + # `padding` (for generation) + # @sudhakars: find a better way to provide the `tensor_format` + te_rope_emb = RotaryPositionEmbedding(self.head_dim)( # Use self.head_dim + max_seq_len=self.llama_config.max_position_embeddings + ).cuda() + + inference_params = kwargs["inference_params"] + # @sudhakars: big assumption that the input is "sbhd" + # batch_size = args[0].shape[0] + if inference_params.qkv_format_legacy == "thd": + ( + max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + ) = inference_params.get_cache_params_from_infer_params() + + # this args cannot be passed to TransformerLayer + keys_to_remove = [ + "position_ids", + "past_key_value", + "output_attentions", + "use_cache", + "cache_position", + ] + for key in keys_to_remove: + kwargs.pop(key, None) + + # import pdb; pdb.set_trace() + # We need to return tuple to be compatible with HF. 
+ return ( + super().forward( + *args, + rotary_pos_emb=te_rope_emb, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_kv=cu_seqlens_kv, + max_seqlen_q=max_seqlen_q, + max_seqlen_kv=max_seqlen_kv, + **kwargs + ), + ) + +class StaticLlamaModel(torch.nn.Module): + """ + StaticLlama is based of HF LlamaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: LlamaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.llama_config = model.config # Store LlamaConfig + self.normalizer = torch.tensor(self.llama_config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + # import pdb; pdb.set_trace() + if hidden_states.shape[1] > 1: + torch.save(hidden_states, "input_ctxt.pth") + + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits + + +class LlamaGenerator(torch.nn.Module): + """ + LlamaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: LlamaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.llama_layers = StaticLlamaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.llama_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_type: str = "arbitrary"): + logits = self.llama_layers(hidden_states, attention_mask=mask, attn_mask_type = mask_type) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. 
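+        # (Note: unlike GemmaGenerator in te_gemma.py, which relies on the generation loop
+        # to update the cache offsets, this generator advances them inside its own forward.)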
+ if self.qkv_format == "thd": + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params(self.inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1) + else: + self.inference_params.setup_before_new_input(length=1) + + return next_tokens + + +class PartialForwardWrapper(torch.nn.Module): + """ + This class wraps a `torch.nn.Module` while partially modifying its `forward` + + CUDAGraphs' `make_graphed_callables` method takes in a module but if only + `functools.partial` is used to wrap the module, it changes the modules' + type and that interferes with the `make_graphed_callables` intrinsics. + """ + def __init__(self, module, **kwargs): + super().__init__() + self.module = module + self.partial_forward = partial(self.module.forward, **kwargs) + + def __call__(self, *args, **kwargs): + return self.partial_forward(*args, **kwargs) + + # @sudhakars: should we use better abstraction? + def set_inference_params(self, *args, **kwargs): + return self.module.set_inference_params(*args, **kwargs) + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`. + """ + original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer + transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.llama.modeling_llama.LlamaDecoderLayer = original_llama_decoder_cls + + +class TELlamaForCausalLM(LlamaForCausalLM): + """ + Causal LM created with `LlamaModel`. The underlying `LlamaDecoderLayer` + class is monkey-patched with `TELlamaDecoderLayer` class before + initializing the causal LM with `LlamaForCausalLM`. + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + with replace_decoder(te_decoder_cls=TELlamaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = LlamaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticLlamaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. 
fix the max seq len to
+        # the longest sequence in the batch.
+        actual_max_seq_len = inputs.ne(0).sum(dim=1).max()
+        inputs.data = new_input_ids[:, :actual_max_seq_len]
+
+    def _next_64_multiply(self, x):
+        return ((x + 63) // 64) * 64
+
+    # This function is overridden in TELlamaForCausalLMCudaGraphs.
+    def _create_hidden_states_buffer(self, input_ids: torch.Tensor):
+        return torch.empty(
+            (input_ids.shape[0], input_ids.shape[1], self.hidden_size),
+            device="cuda",
+            dtype=torch.float32,
+        )
+
+    # This function is overridden in TELlamaForCausalLMCudaGraphs.
+    def _create_inference_params(self, *args, **kwargs):
+        infer_params = InferenceParams(*args, **kwargs)
+
+        max_batch_size = kwargs["max_batch_size"]
+
+        # Initialize the legacy length-tracking tensors used by the THD code path.
+        infer_params.cached_sequence_lengths = torch.zeros(
+            (max_batch_size,), device="cuda", dtype=torch.int32
+        )
+        infer_params.input_sequence_lengths = torch.zeros(
+            (max_batch_size,), device="cuda", dtype=torch.int32
+        )
+
+        return infer_params
+
+    # This function is overridden in TELlamaForCausalLMCudaGraphs.
+    def _get_max_input_seq_len(self, input_ids):
+        return input_ids.shape[1]
+
+    # The buffer for generation is a part (the beginning) of the hidden states buffer.
+    # This function returns a view of it and also copies data into it if provided.
+    def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None):
+        # hidden_states_buffer has shape [b, s, hd]
+        # generation_buffer will have shape [b, 1, hd]
+        # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)"
+        # would return a non-contiguous buffer, which we want to avoid.
+        output = hidden_states_buffer.view(-1)[
+            : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2]
+        ]
+        if data_to_copy is not None:
+            output.copy_(data_to_copy.reshape(-1))
+        generation_buffer = output.view(
+            (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2])
+        )
+        return generation_buffer
+
+    def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams):
+        hidden_states = self._create_hidden_states_buffer(input_ids)
+        hidden_states.data[:] = self.model.embed_tokens(input_ids)
+
+        # We need to update the cache offsets before every forward pass.
+        lengths = input_ids.ne(0).sum(dim=1)
+        if self.config.qkv_format == "thd":
+            max_input_length = input_ids.shape[1]
+            setup_cache_params_from_infer_params(inference_params, lengths, max_input_length)
+        else:
+            inference_params.setup_before_new_input(length=input_ids.shape[1])
+
+        logits = self._model_context_phase(
+            hidden_states,
+            attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None),
+            attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary",
+        )
+
+        # We pick the logits corresponding to the last token of each sequence. For
+        # qkv_format == "thd" the sequences have various lengths and the last-token
+        # positions are given by (inference_params.input_sequence_lengths - 1);
+        # otherwise the last token is simply the final position of the padded sequence.
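+        # For example, with qkv_format == "thd" and input_sequence_lengths == [3, 5],
+        # the logits picked below are logits[0, 2, :] and logits[1, 4, :].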
+        if self.config.qkv_format == "thd":
+            logits = logits[
+                torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, :
+            ]
+        else:
+            logits = logits[:, -1, :]
+        next_tokens = torch.argmax(logits, dim=1)
+
+        # hidden_states has shape [b, s, hd].
+        # We return the hidden state for the last token - the output has shape [b, 1, hd].
+        hidden_states = self._get_generation_buffer(
+            hidden_states, self.model.embed_tokens(next_tokens)
+        )
+        return hidden_states, next_tokens
+
+    def _make_mask_one_token_longer(self, mask):
+        return torch.cat(
+            [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        pad_token_id: int = 0,
+        max_new_tokens: int = 0,
+        *args,
+        **kwargs
+    ):
+        self.eval()
+
+        # We need both autocasts: FP8 for operations that can run in lower precision
+        # and BF16 for those that cannot.
+        with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast(
+            enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None
+        ):
+
+            lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze()  # [b]
+
+            if self.config.qkv_format == "thd":
+                # For the thd layout padding is at the end, otherwise at the beginning.
+                TELlamaForCausalLM._padding_to_end(input_ids, lengths)
+
+            batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len(
+                input_ids
+            )
+
+            # InferenceParams is a cache in which the keys and values of previous tokens
+            # are stored. It also stores the lengths of both the already generated and
+            # the input sequences.
+            head_dim = self.config.hidden_size // self.config.num_attention_heads
+            inference_params = self._create_inference_params(
+                max_batch_size=batch_size,
+                max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens),
+                num_heads_kv=self.config.num_key_value_heads,
+                head_dim_v=head_dim,
+                head_dim_k=head_dim,
+                dtype=torch.bfloat16,
+                is_paged=True,
+                page_size=64,
+                total_num_pages=64 * 3,  # number of pages in the paged KV cache
+            )
+
+            def init_cache_params_in_infer_params(inference_params):
+                inference_params.cached_sequence_lengths = torch.zeros(
+                    (batch_size,), device="cuda", dtype=torch.int32
+                )
+                inference_params.input_sequence_lengths = torch.zeros(
+                    (batch_size,), device="cuda", dtype=torch.int32
+                )
+
+            init_cache_params_in_infer_params(inference_params)
+            inference_params.qkv_format_legacy = self.config.qkv_format
+
+            self._model_context_phase.set_inference_params(inference_params)
+            self._model_generation_phase.set_inference_params(inference_params)
+
+            hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params)
+
+            # Generation phase.
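+            # The cache offsets are first advanced for the token produced in the context
+            # phase; the loop that follows then generates one token per iteration, feeding
+            # the embedding of the previous token back in through `hidden_states`.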
+ if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params(inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. + # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + # setup_cache_params_from_infer_params(inference_params, input_ids) + # @sudhakars: could create position_ids from mask here + next_tokens = self._model_generation_phase(hidden_states, mask, mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="arbitrary" + ) + return logits + +class TELlamaForCausalLMCudaGraphs(TELlamaForCausalLM): + """ + TELlamaForCausalLMCudaGraphs is the version of the class TELlamaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + config.cuda_graphs_static_batch_size, + config.cuda_graphs_static_max_context_len, + config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. + self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) + self.inference_params = InferenceParams( + max_batch_size=config.cuda_graphs_static_batch_size, + max_sequence_length=config.cuda_graphs_static_max_seq_len, + qkv_format="thd", + ) + + self._model_generation_phase.set_inference_params(self.inference_params) + self._model_context_phase.set_inference_params(self.inference_params) + + def record(self): + # We want to record model in training=False, because it will be used in generation. + self.eval() + + # Here "the trick" happens. We override methods from TELlamaForCausalLM + # with their recorded version. After invocation of each of them, + # captured graph will be replayed with minimal usage of CPU, + # what will lead to huge speedup. 
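+        # A rough sketch of what happens below (illustrative only; `module` and
+        # `static_buffer` are placeholders): the callable is captured once on static
+        # buffers and the returned graphed callable replays the recorded kernels.
+        #
+        #     graphed = te.pytorch.make_graphed_callables(
+        #         module, (static_buffer,), num_warmup_iters=3, allow_unused_input=True
+        #     )
+        #     out = graphed(static_buffer)  # replays the captured graph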
+ input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + self.inference_params.reset() + self.inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + max_input_length=input_shape[1], + ) + self._model_context_phase = self.record_graph( + PartialForwardWrapper(self._model_context_phase, attn_mask_type="padding_causal" + if self.inference_params.qkv_format == "thd" + else "arbitrary"), + self.hidden_states_buffer + ) # CUDA Graphs recording + + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + self.inference_params.reset() + self.inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + max_input_length=input_shape[1], + ) + self._model_generation_phase = self.record_graph( + PartialForwardWrapper(self._model_generation_phase, mask_type="padding" + if self.inference_params.qkv_format=="thd" + else "arbitrary"), + self.generation_buffer + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=3, + ) + return graphed_function diff --git a/docs/examples/te_gemma/te_llama_loading_weights.py b/docs/examples/te_gemma/te_llama_loading_weights.py new file mode 100755 index 0000000000..a5ab151f67 --- /dev/null +++ b/docs/examples/te_gemma/te_llama_loading_weights.py @@ -0,0 +1,224 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import re +import gc +import torch + +from typing import List + +from transformer_engine.pytorch.fp8 import fp8_model_init + +from transformers.modeling_utils import load_state_dict, _load_state_dict_into_model +from transformers.utils.hub import get_checkpoint_shard_files + +""" + This file contains logic of mapping the HuggingFace LlamaModel parameters + with TransformerEngine TransformerLayer. When we have initialized Transformer models + both with HF and with TE, we can copy parameters from the first to the second. +""" + + +def _load_weights_for_fp8_model(vanilla_model, hyperparams): + # The weights are loaded from the file with state_dict + # of model with weights which contains also fp8 parameters. 
+ # The weights are in BF16 precision, but they contain fp8 metadata + # computed by the calibration procedure. + vanilla_model.load_state_dict( + torch.load(hyperparams.fp8_model_weights_filename), + strict=False, + # strict = false, because some parameters have + # multiple pointers to the same weight + # vanilla_model._model_context_phase.model + # and vanilla_model._model_generation_phase.model + ) + + +def _load_weights_for_standard_model(vanilla_model, config): + # The weights are loaded from the file with original weights. + archive_file = os.path.join(config.model_name, "model.safetensors.index.json") + resolved_archive_file, _ = get_checkpoint_shard_files(config.model_name, archive_file) + total_dict = {} + for shard_file in resolved_archive_file: + state_dict = load_state_dict(shard_file) + total_dict.update(state_dict) + + replace_params( + total_dict, + vanilla_model.state_dict(), + config, + qkv_fused_and_interleaved=config.fuse_qkv_params, + ) + # Copy parameters like embedding: + _load_state_dict_into_model(vanilla_model, total_dict, start_prefix="") + + # Force mem release. Taken from huggingface code. + del total_dict + gc.collect() + + +def load_te_model(cls, config): + """ + Custom method adapted from `from_pretrained` method in HuggingFace + Transformers repo: + https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579 + """ + + config.use_cache = False # To make TransformerLayer compatible with LlamaModel + with fp8_model_init(config.fp8_model_init): + # there we need only to create model + vanilla_model = cls(config).to(torch.bfloat16).cuda() + + # return vanilla_model + # and now we copy the weights into it + if config.fp8_model_weights_filename is not None: + _load_weights_for_fp8_model(vanilla_model, config) + else: + _load_weights_for_standard_model(vanilla_model, config) + + return vanilla_model + + +def _get_all_layer_prefixes_to_update(hf_state_dict): + """ + There are many parameters in hf_state_dict, whose name start with "model.layers.[number]." + This function extracts all strings like "model.layers.[number]." + that are starting strings of keys in hf_state_dict. + """ + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." + m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + return all_layer_prefixes + + +def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): + # collect all layer prefixes to update + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." 
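+        # For example, the key "model.layers.0.self_attn.q_proj.weight" matches and
+        # contributes the prefix "model.layers.0.".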
+ m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + + for layer_prefix in all_layer_prefixes: + # When loading weights into models with less number of layers, skip the + # copy if the corresponding layer doesn't exist in HF model + if layer_prefix + "input_layernorm.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.layer_norm_weight"].data[ + : + ] = hf_state_dict[layer_prefix + "input_layernorm.weight"].data[:] + + if layer_prefix + "self_attn.q_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.query_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.q_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.k_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.key_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.k_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.v_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.value_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.v_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.o_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.proj.weight"].data[:] = hf_state_dict[ + layer_prefix + "self_attn.o_proj.weight" + ].data[:] + + if layer_prefix + "post_attention_layernorm.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.layer_norm_weight"].data[:] = hf_state_dict[ + layer_prefix + "post_attention_layernorm.weight" + ].data[:] + + # It may happen that gate_proj.weight and up_proj.weight will be in the different files, so we need to + # load them separately. + if layer_prefix + "mlp.gate_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc1_weight"].data[ + : config.intermediate_size + ] = hf_state_dict[layer_prefix + "mlp.gate_proj.weight"].data + + if layer_prefix + "mlp.up_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc1_weight"].data[ + config.intermediate_size : + ] = hf_state_dict[layer_prefix + "mlp.up_proj.weight"].data + + if layer_prefix + "mlp.down_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc2_weight"].data[:] = hf_state_dict[ + layer_prefix + "mlp.down_proj.weight" + ].data[:] + return all_layer_prefixes + + +# def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): +# """ +# Replaces params from TE TransformerLayer state_dict with corresponding parameters +# from HuggingFace LlamaModel state_dict. 
+# """ +# all_layer_prefixes: List[str] = _get_all_layer_prefixes_to_update(hf_state_dict) + +# head_dim = config.hidden_size // config.num_attention_heads + +# for layer_prefix in all_layer_prefixes: + +# def copy_from_ht_to_te(te_name, hf_name, start=None, end=None): +# te_state_dict[layer_prefix + te_name].data[start:end].copy_( +# hf_state_dict[layer_prefix + hf_name] +# ) + +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.layer_norm_weight", "input_layernorm.weight" +# ) +# copy_from_ht_to_te("self_attention.proj.weight", "self_attn.o_proj.weight") +# copy_from_ht_to_te("layernorm_mlp.layer_norm_weight", "post_attention_layernorm.weight") +# copy_from_ht_to_te("layernorm_mlp.fc2_weight", "mlp.down_proj.weight") +# copy_from_ht_to_te( +# "layernorm_mlp.fc1_weight", "mlp.gate_proj.weight", end=config.intermediate_size +# ) +# copy_from_ht_to_te( +# "layernorm_mlp.fc1_weight", "mlp.up_proj.weight", start=config.intermediate_size +# ) + +# if qkv_fused_and_interleaved: +# """ +# When qkv_fused_and_interleaved=True, key, query and value layers are on one tensor +# in TE TransformerLayer. Moreover they are interleaved within each head. +# Let q_i, k_i and v_i be query, key and value layers for i-th head respectively. +# Then TE stores weight tensor in the form: +# [q1 k1 v1 q2 k2 v2 ...] +# This is done to maximally optimize performance time. +# """ +# te_qkv_layer = te_state_dict[layer_prefix + "self_attention.layernorm_qkv.weight"] + +# def copy_interleave(hf_name, idx): +# src = hf_state_dict[layer_prefix + hf_name] +# for head_nr in range(config.num_attention_heads): +# dst_offset = head_nr * config.head_dim * 3 +# dst_slice = slice( +# dst_offset + idx * config.head_dim, dst_offset + (idx + 1) * config.head_dim +# ) +# src_slice = slice( +# head_nr * config.head_dim, head_nr * config.head_dim + config.head_dim +# ) +# te_qkv_layer[dst_slice, :] = src[src_slice, :] + +# copy_interleave("self_attn.q_proj.weight", 0) +# copy_interleave("self_attn.k_proj.weight", 1) +# copy_interleave("self_attn.v_proj.weight", 2) +# else: +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.query_weight", "self_attn.q_proj.weight" +# ) +# copy_from_ht_to_te("self_attention.layernorm_qkv.key_weight", "self_attn.k_proj.weight") +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.value_weight", "self_attn.v_proj.weight" +# ) + +# return all_layer_prefixes diff --git a/docs/examples/te_gemma/test_paged_attn.ipynb b/docs/examples/te_gemma/test_paged_attn.ipynb new file mode 100755 index 0000000000..543ebe9262 --- /dev/null +++ b/docs/examples/te_gemma/test_paged_attn.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ace403ac-c276-4378-a4e8-0155165f9934", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb b/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb new file mode 100755 index 0000000000..7875ffc9f3 --- /dev/null +++ b/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb @@ 
-0,0 +1,314 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accelerating a Hugging Face Gemma model finetuning with Transformer Engine" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the previous [tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb), we demonstrated how to accelerate HF Llama models using the Transformer Engine library. We replaced `LlamaDecoderLayer` with `TransformerLayer` from the Transformer Engine, achieving a speedup. Furthermore, we conducted the finetuning in FP8 precision, which yielded an additional speedup.\n", + "\n", + "Now, we will undertake a similar enhancement for the Google's [Gemma](https://blog.google/technology/developers/gemma-open-models/) model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial\n", + "\n", + "Following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_gemma.py`\n", + " - This file contains the code to load a Hugging Face Gemma checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `GemmaDecoderLayer`. This is used in the following two sections of the tutorial - \"Improvement 1\" and \"Improvement 2\".\n", + "2. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "3. `requirements.txt`\n", + " - This file contains necessary Python packages for this tutorial.\n", + "4. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "\n", + "import torch\n", + "cudnn_version = torch.backends.cudnn.version()\n", + "assert cudnn_version >= 90100, \"cuDNN version >= 9.1.0 is needed to run this tutorial.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Differences between Llama and Gemma" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thr Llama and the Gemma are very similar models - both are based on Transformer Decoder architecture. The most important architectural differences between them are the following:\n", + "\n", + "\n", + "| Feature | Llama | Gemma |\n", + "|----------------------------------------------|------------------------------------|--------------------------------------------|\n", + "| **Norm Layer** | Standard RMSNorm
$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\varepsilon}} * \\gamma + \\beta$ | RMSNorm with zero centered gamma parameter
$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\varepsilon}} * (\\textcolor{red}{1 +} \\gamma) + \\beta$ |\n", + "| **Embedding Dimension/Head Dimension** | 4096/4096 | 3072/4096 |\n", + "| **Activation Function** | SwiGlu | GeGlu |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Baseline] Running HF `GemmaModel` (Precision: `BF16`)\n", + "\n", + "Similarly to the Llama tutorial, we begin the experiments by running baseline Hugging Face Gemma model finetuning in BF16 precision.\n", + "\n", + "

\n", + "\n", + "Note\n", + " \n", + "This tutorial loads and trains a Gemma 7B model which takes up most of the GPU memory and therefore, we need to restart the jupyter notebook each time before running the following sections. A small utility method `restart_jupyter_notebook` is defined in the accompanying `utils.py` file. This function restarts the jupyter notebook so that the GPU memory is flushed before the model is loaded again from the checkpoint in order to avoid running into OOM (Out Of Memory) errors.\n", + "\n", + "If the utility doesn't work, comment this line `restart_jupyter_notebook()` in the following cell and manually restart the jupyter notebook before running the cell. Repeat the same for other sections in this tutorial.\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "298 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_baseline_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's add this information in a table and keep comparing it with a few possible improvements in future sections:\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Improvement 1] Replace HF's `GemmaDecoderLayer` with TE's `TransformerLayer` (Precision: `BF16`)\n", + "\n", + "We replace *GemmaDecoderLayer* with the highly tuned *TransformerLayer*, similarly to our approach in the [Llama tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb). Let's observe the impact this change has on the model's speed." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "257 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_gemma_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `GemmaDecoderLayer` gives a speedup of **16%** even when using only BF16 precision!\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 257 | 1.16 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Improvement 2] Replace HF's `GemmaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n", + "\n", + "The last improvement is about enabling FP8 precision. Let's see how it works." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "214 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "#restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"fp8\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_gemma_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 257 | 1.16 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 214 | 1.39 |\n", + "\n", + "\n", + "After turning on FP8 precision, we get even more speedup of almost **39%**!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "As shown in the [Llama tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb), using the `TransformerLayer` module from Transformer Engine to replace Hugging Face's `GemmaDecoderLayer` results in a speedup compared to Hugging Face's native Gemma implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## See more\n", + "\n", + "We also prepared [tutorial](./tutorial_generation_gemma_with_te.ipynb) in which we will show how to speedup the Gemma model generation using Transformer Engine." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb b/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb new file mode 100755 index 0000000000..acb93b795e --- /dev/null +++ b/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb @@ -0,0 +1,1277 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "40364db7", + "metadata": {}, + "source": [ + "# Accelerating token generation of the Hugging Face Gemma Model with Transformer Engine\n", + "\n", + "Generative AI has made remarkable strides in recent years, with Large Language Models (LLMs) like ChatGPT at the forefront. These models have revolutionized how we interact with machine-generated content, providing capabilities that range from writing assistance to complex decision support. The core functionality of these models is the generation process, which involves predicting the next token in a sequence based on the preceding text. This task is critical for applications such as automated content creation, translation, and more, emphasizing the importance of efficient implementation.\n", + "\n", + "\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Animation 1: Hugging Face Gemma model token generation.\n", + "
\n", + "
\n", + "\n", + "For those seeking a deeper understanding of text generation mechanisms in Transformers, it is recommended to check out the [HuggingFace generation tutorial](https://huggingface.co/docs/transformers/llm_tutorial).\n", + "\n", + "In the previous tutorials on [Llama](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb) and [Gemma](./tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb), it was demonstrated how finetuning can be accelerated using the Transformer Engine's `TransformerLayer`. Building on this foundation, the current objective is to enhance the generation speed of the Gemma model.\n", + "\n", + "This tutorial will introduce and explain several advanced features of the Transformer Engine that contribute to this goal:\n", + "\n", + "###### **1. THD Attention Layout.**\n", + "\n", + "Addressing the challenge of computing attention for sequences with varying lengths, a common method is to pad these sequences and apply an attention mask. The Transformer Engine, however, offers a more optimized approach—by specifying the lengths and offsets of the sequences, attention can be computed directly. Instead of passing the tensor with shape `[b, s, h, d]` and the attention mask, one can pass a tensor of the shape `[t, h, d]` along with tensors detailing cumulative sequence lengths and offsets to run the attention optimized for this case. This specific attention layout is referred to as the **THD layout**. \n", + "\n", + "\n", + "The letter `t` in the standard `[t, h, d]` layout is equal to the total length of the sequences, namely `t = s_1 + s_2 + ... + s_b`, where `s_i` denotes the length of sequence `i`. TransformerEngine supports a THD layout that incorporates gaps between these sequences - the lengths of the offsets need to be passed in the additional parameter.\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 1: The difference between BSHD (default) and THD attention layouts is as follows: with BSHD, one needs to provide the attention mask, while with THD, one needs to provide cumulative sequence lengths and sequence offsets.\n", + "
\n", + "
\n", + "\n", + "###### **2. CUDA Graphs API.**\n", + "\n", + "The speed of GPUs is increasing at a rapid pace. It turns out that sometimes the runtime of kernels is shorter than the time it takes for the CPU to submit them, which can lead to significant overhead. CUDA Graphs can address this issue. When certain kernels are executed repeatedly, it allows us to record and replay them with less CPU involvement. This becomes particularly useful in applications like token generation, where a `TransformerLayer` is run for every token that needs to be generated.\n", + "\n", + "One can read more about CUDA Graphs [here](https://developer.nvidia.com/blog/cuda-graphs/).\n", + "\n", + "PyTorch exposes graphs via a raw `torch.cuda.CUDAGraph` class and two convenience wrappers: `torch.cuda.graph` and `torch.cuda.make_graphed_callables`. More information about the cuda graphs in Pytorch can be found [here](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/).\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 2: CUDA Graphs reduce the overhead generated by the long time it takes to launch a single kernel. It enables the recording and replaying of subsequent launches, thus reducing the total time used by the CPU.\n", + "
\n", + "
\n", + "\n", + "\n", + "###### **3. FP8 Weights Calibration.**\n", + "\n", + "Assuming that the model is trained in FP32/BF16 precision and the goal is to execute it in FP8 precision, the process isn't straightforward due to the absence of appropriate FP8 scaling factors. In this scenario, FP8 calibration becomes essential. By conducting several forward passes on sample data, the FP8 scaling parameters can be computed. This calibration allows the model to operate correctly in FP8 precision.\n", + "\n", + "It is highly recommended to familiarize oneself with the [tutorial](../../examples/fp8_primer.ipynb) on FP8 precision to understand the importance of proper scaling factors.\n", + "\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 3:\n", + "If the model is trained in BF16/FP32, it does not include the computed FP8 scaling factors. When it is run under fp8_autocast(), the value of these scaling factors will default to their initial values, which can cause numerical errors. Weight calibration involves calculating FP8 scaling factors from higher precision forward passes. Once these factors are computed, the model becomes numerically stable. \n", + "
\n", + "
\n", + "\n", + "###### **4. FP8 Model Weights.**\n", + "\n", + "The typical approach is to store weights in higher precision and then cast them to fp8 before operations. This may prevent accuraccy drops in training. However, for inference, this level of precision is not necessary.\n", + "\n", + "The TransformerEngine includes a wrapper `fp8_model_​init`, which allows for the creation of models that store only the FP8 copy of the weights. This eliminates the need to cast from higher precision to BF16, saving time in this casting process. \n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 4: Model under fp8_autocast() stores weights in high precision by default, and casts them if needed. It can leads to slowdown and increased memory usage. Using fp8_model_init() results in storing weight in FP8.\n", + "
\n", + "
\n", + "\n", + "###### Benchmarking\n", + "\n", + "We'll evaluate the generation time across one benchmark: generation with context phase max sequence length = 128, batch size = 64 and number of generated tokens = 896 on random texts with random lengths.\n", + "\n", + "
\n", + "Note\n", + " \n", + "This tutorial focuses on showcasing the mentioned features of Transformer Engine in the context of token generation. It's important to note, however, that NVIDIA provides [TensorRT](https://developer.nvidia.com/tensorrt), which is optimized for inference tasks and should be considered for such use cases.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "b18f91a9", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "e5201d77", + "metadata": {}, + "source": [ + "Following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_gemma.py`\n", + " - This file contains the code to load a Hugging Face Gemma checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `GemmaDecoderLayer`. It does also contain code for generation with THD attention, CUDA Graphs and weight calibration.\n", + "2. `te_gemma_loading_weights.py`\n", + " - This file contains logic of mapping the parameters from `GemmaDecoderLayer` into the `TransformerLayer`.\n", + "3. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "4. `requirements.txt`\n", + " - This file contains necessary Python packages for this tutorial.\n", + "5. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "31390c76", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", + "Collecting transformers==4.41.1 (from -r requirements.txt (line 1))\n", + " Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)\n", + "Collecting accelerate==0.30.1 (from -r requirements.txt (line 2))\n", + " Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)\n", + "Collecting datasets==2.19.1 (from -r requirements.txt (line 3))\n", + " Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)\n", + "Collecting sentencepiece==0.2.0 (from -r requirements.txt (line 4))\n", + " Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (3.16.1)\n", + "Collecting huggingface-hub<1.0,>=0.23.0 (from transformers==4.41.1->-r requirements.txt (line 1))\n", + " Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (1.24.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (2024.9.11)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (2.32.3)\n", + "Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.1->-r requirements.txt (line 1))\n", + " Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from 
transformers==4.41.1->-r requirements.txt (line 1)) (0.4.5)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (4.66.5)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate==0.30.1->-r requirements.txt (line 2)) (6.0.0)\n", + "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==0.30.1->-r requirements.txt (line 2)) (2.5.0a0+e000cf0ad9.nv24.10)\n", + "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (16.1.0)\n", + "Collecting pyarrow-hotfix (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n", + "Collecting dill<0.3.9,>=0.3.0 (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (2.2.2)\n", + "Collecting xxhash (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", + "Collecting multiprocess (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)\n", + "Collecting fsspec<=2024.3.1,>=2023.1.0 (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (3.10.5)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (2.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (4.0.3)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.0->transformers==4.41.1->-r requirements.txt (line 1)) (4.12.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in 
/usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (2024.8.30)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (3.3)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (3.1.4)\n", + "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (1.13.1)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (1.3.0)\n", + "INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.\n", + "Collecting multiprocess (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2024.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets==2.19.1->-r requirements.txt (line 3)) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (2.1.5)\n", + "Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m175.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading accelerate-0.30.1-py3-none-any.whl (302 kB)\n", + "Downloading datasets-2.19.1-py3-none-any.whl (542 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m542.0/542.0 kB\u001b[0m \u001b[31m334.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m628.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n", + "Downloading fsspec-2024.3.1-py3-none-any.whl (171 kB)\n", + "Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)\n", + "Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m296.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", + "Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", + "Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", + "Installing collected packages: sentencepiece, xxhash, pyarrow-hotfix, fsspec, dill, multiprocess, huggingface-hub, tokenizers, accelerate, transformers, datasets\n", + " Attempting uninstall: fsspec\n", + " Found existing installation: fsspec 2024.6.1\n", + " Uninstalling fsspec-2024.6.1:\n", + " Successfully uninstalled fsspec-2024.6.1\n", + " Attempting uninstall: dill\n", + " Found existing installation: dill 0.3.9\n", + " Uninstalling dill-0.3.9:\n", + " Successfully uninstalled dill-0.3.9\n", + "Successfully installed accelerate-0.30.1 datasets-2.19.1 dill-0.3.8 fsspec-2024.3.1 huggingface-hub-0.26.2 multiprocess-0.70.16 pyarrow-hotfix-0.6 sentencepiece-0.2.0 tokenizers-0.19.1 transformers-4.41.1 xxhash-3.5.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -r requirements.txt\n", + "\n", + "import torch\n", + "cudnn_version = torch.backends.cudnn.version()\n", + "assert cudnn_version >= 90100, \"cuDNN version >= 9.1.0 is needed to run this tutorial.\"" + ] + }, + { + "cell_type": "markdown", + "id": "e8dfabbf", + "metadata": {}, + "source": [ + "\n", + "|\n", + "## [Baseline] Running Hugging Face generation with Gemma model" + ] + }, + { + "cell_type": "markdown", + "id": "59560bff", + "metadata": {}, + "source": [ + "HuggingFace Transformers library offers generation API. \n", + "HuggingFace generation for the Gemma model will be used as a baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2803e0ec", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n", + "`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.\n", + "Gemma's activation function will be set to `gelu_pytorch_tanh`. 
Please, use\n", + "`config.hidden_activation` if you want to override this behaviour.\n", + "See https://github.com/huggingface/transformers/pull/29402 for more details.\n", + "Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.02s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops high-performance computer graphics and computer processing units (CPUs) for the gaming and professional markets.\n", + "* The company was founded in 1993 and is headquartered in Santa Clara\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops high-performance computer graphics and computer processing units (CPUs) for the gaming and professional markets.\n", + "* The company was founded in 1993 and is headquartered in Santa Clara\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "# !!! `model_name` attr must point to the location of the model weights !!!\n", + "# Weights can be downloaded from: https://huggingface.co/google/gemma-7b.\n", + "# Weights should be in the *.safetensors HF format, not in the original format.\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "\n", + "model = init_baseline_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "b3698dc6", + "metadata": {}, + "source": [ + "Let's put this time into the table for later comparison.\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |" + ] + }, + { + "cell_type": "markdown", + "id": "8bb40f45", + "metadata": {}, + "source": [ + "## [Improvement 1] Using TransformerLayer from Transformer Engine instead of GemmaDecoderLayer." + ] + }, + { + "cell_type": "markdown", + "id": "263b40f2", + "metadata": {}, + "source": [ + "As in the [Gemma](./tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb) finetuning tutorial, a GemmaDecoderLayer is substituted by a tuned TransformerLayer from the Transformer Engine. Let's run it and compare the time with the baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9dceef93", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. 
Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py\u001b[0m(8223)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 8221 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 8222 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 8223 \u001b[0;31m key_layer, value_layer = inference_params.save_to_kv_cache(\n", + "\u001b[0m\u001b[0;32m 8224 \u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer_number\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_layer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue_layer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 8225 \u001b[0;31m )\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> key_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([128, 64, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> value_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([128, 64, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> query_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([8192, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> c\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "Queries, keys and values must be 4D tensors when qkv_format = bshd!", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 11\u001b[0m\n\u001b[1;32m 7\u001b[0m hyperparams\u001b[38;5;241m.\u001b[39mmodel_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/tmp/gemma-7b-hf\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\u001b[39;00m\n\u001b[1;32m 9\u001b[0m model \u001b[38;5;241m=\u001b[39m init_te_gemma_model(hyperparams)\n\u001b[0;32m---> 11\u001b[0m \u001b[43mprint_sample_of_generated_texts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# benchmark_generation(model)\u001b[39;00m\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/utils.py:280\u001b[0m, in \u001b[0;36mprint_sample_of_generated_texts\u001b[0;34m(model)\u001b[0m\n\u001b[1;32m 277\u001b[0m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcuda()\n\u001b[1;32m 278\u001b[0m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcuda()\n\u001b[0;32m--> 280\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_new_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m50\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 281\u001b[0m generated_texts \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mbatch_decode(outputs, skip_special_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprint_output\u001b[39m(prompts, generated_texts, idx):\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:450\u001b[0m, in \u001b[0;36mTEGemmaForCausalLM.generate\u001b[0;34m(self, input_ids, pad_token_id, max_new_tokens, *args, **kwargs)\u001b[0m\n\u001b[1;32m 446\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 447\u001b[0m \u001b[38;5;66;03m# For thd layout padding is at the end, otherwise at the beginning.\u001b[39;00m\n\u001b[1;32m 448\u001b[0m 
TEGemmaForCausalLM\u001b[38;5;241m.\u001b[39m_padding_to_end(input_ids, lengths)\n\u001b[0;32m--> 450\u001b[0m hidden_states, next_tokens \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_generate_context_phase\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;66;03m# Generation phase.\u001b[39;00m\n\u001b[1;32m 453\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:381\u001b[0m, in \u001b[0;36mTEGemmaForCausalLM._generate_context_phase\u001b[0;34m(self, input_ids, inference_params)\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 379\u001b[0m inference_params\u001b[38;5;241m.\u001b[39msetup_before_new_input(length\u001b[38;5;241m=\u001b[39minput_ids\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m--> 381\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_model_context_phase\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 382\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 383\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m!=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpadding_causal\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43marbitrary\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 385\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[38;5;66;03m# We choose logits coresponding with last token in each sequence,\u001b[39;00m\n\u001b[1;32m 388\u001b[0m \u001b[38;5;66;03m# which have various lengths - they are stored in 
(inference_params.incoming_seq_len - 1)\u001b[39;00m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;66;03m# Tensor when qkv_format == \"thd\" and\u001b[39;00m\n\u001b[1;32m 390\u001b[0m \u001b[38;5;66;03m# they are the last token in the sequence when qkv_format != \"thd\".\u001b[39;00m\n\u001b[1;32m 391\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:183\u001b[0m, in \u001b[0;36mStaticGemmaModel.forward\u001b[0;34m(self, hidden_states, attention_mask, attn_mask_type)\u001b[0m\n\u001b[1;32m 180\u001b[0m 
hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnormalizer\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, decoder_layer \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mlayers):\n\u001b[0;32m--> 183\u001b[0m hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 186\u001b[0m \u001b[43m \u001b[49m\u001b[43mself_attn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmask\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\n\u001b[1;32m 189\u001b[0m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 190\u001b[0m ] \u001b[38;5;66;03m# static copy - for CUDA graphs\u001b[39;00m\n\u001b[1;32m 192\u001b[0m hidden_states\u001b[38;5;241m.\u001b[39mcopy_(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mnorm(hidden_states)) \u001b[38;5;66;03m# static copy - for CUDA graphs\u001b[39;00m\n\u001b[1;32m 193\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(hidden_states)\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to 
skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:151\u001b[0m, in \u001b[0;36mTEGemmaDecoderLayer.forward\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 148\u001b[0m kwargs\u001b[38;5;241m.\u001b[39mpop(key, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# We need to return tuple to be compatible with HF.\u001b[39;00m\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mte_rope_emb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m,)\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/transformer.py:690\u001b[0m, in \u001b[0;36mTransformerLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, self_attn_mask_type, window_size, encoder_output, enc_dec_attn_mask, enc_dec_attn_mask_type, enc_dec_window_size, is_first_microbatch, checkpoint_core_attention, inference_params, rotary_pos_emb, core_attention_bias_type, core_attention_bias, alibi_slopes, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, fast_zero_fill)\u001b[0m\n\u001b[1;32m 687\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m cast_if_needed(hidden_states, torch\u001b[38;5;241m.\u001b[39mget_autocast_gpu_dtype())\n\u001b[1;32m 689\u001b[0m \u001b[38;5;66;03m# Self attention.\u001b[39;00m\n\u001b[0;32m--> 690\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself_attention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 691\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 692\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 693\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mself_attn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 694\u001b[0m \u001b[43m \u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwindow_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 695\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 696\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_first_microbatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_first_microbatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 697\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 698\u001b[0m \u001b[43m \u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 699\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 700\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 701\u001b[0m \u001b[43m \u001b[49m\u001b[43malibi_slopes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malibi_slopes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 702\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 703\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 704\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 706\u001b[0m \u001b[43m \u001b[49m\u001b[43mfast_zero_fill\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfast_zero_fill\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 707\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 709\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_residual_connection_post_layernorm \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_layernorm:\n\u001b[1;32m 710\u001b[0m attention_output, attention_bias, residual \u001b[38;5;241m=\u001b[39m self_attention_outputs\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py:9453\u001b[0m, in \u001b[0;36mMultiheadAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, encoder_output, attn_mask_type, window_size, is_first_microbatch, checkpoint_core_attention, inference_params, rotary_pos_emb, core_attention_bias_type, core_attention_bias, alibi_slopes, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, fast_zero_fill)\u001b[0m\n\u001b[1;32m 9447\u001b[0m query_layer \u001b[38;5;241m=\u001b[39m query_layer\u001b[38;5;241m.\u001b[39mview(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m*\u001b[39mquery_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m2\u001b[39m:])\u001b[38;5;241m.\u001b[39mcontiguous()\n\u001b[1;32m 9449\u001b[0m \u001b[38;5;66;03m# ===========================\u001b[39;00m\n\u001b[1;32m 9450\u001b[0m \u001b[38;5;66;03m# Core attention computation\u001b[39;00m\n\u001b[1;32m 9451\u001b[0m \u001b[38;5;66;03m# ===========================\u001b[39;00m\n\u001b[0;32m-> 9453\u001b[0m context_layer \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcore_attention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9454\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9455\u001b[0m \u001b[43m \u001b[49m\u001b[43mkey_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9456\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalue_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9457\u001b[0m \u001b[43m \u001b[49m\u001b[43mqkv_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9458\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9459\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9460\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9461\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9462\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9463\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9464\u001b[0m \u001b[43m \u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwindow_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9465\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9466\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9467\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9468\u001b[0m \u001b[43m \u001b[49m\u001b[43malibi_slopes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malibi_slopes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9469\u001b[0m \u001b[43m \u001b[49m\u001b[43mfast_zero_fill\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfast_zero_fill\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9470\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9471\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9473\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 9474\u001b[0m \u001b[38;5;66;03m# [b * sq, h] -> [qs, b, h]\u001b[39;00m\n\u001b[1;32m 9475\u001b[0m context_layer \u001b[38;5;241m=\u001b[39m context_layer\u001b[38;5;241m.\u001b[39mview(\n\u001b[1;32m 9476\u001b[0m (inference_params\u001b[38;5;241m.\u001b[39mmax_batch_size, 
\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, context_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 9477\u001b[0m )\u001b[38;5;241m.\u001b[39mcontiguous()\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py:8301\u001b[0m, in \u001b[0;36mDotProductAttention.forward\u001b[0;34m(self, query_layer, key_layer, value_layer, attention_mask, qkv_format, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded, max_seqlen_q, max_seqlen_kv, attn_mask_type, window_size, checkpoint_core_attention, core_attention_bias_type, core_attention_bias, alibi_slopes, fast_zero_fill, inference_params, is_first_microbatch)\u001b[0m\n\u001b[1;32m 8298\u001b[0m context_parallel \u001b[38;5;241m=\u001b[39m cp_size \u001b[38;5;241m>\u001b[39m 
\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 8300\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m qkv_format \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msbhd\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbshd\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m-> 8301\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\n\u001b[1;32m 8302\u001b[0m \u001b[38;5;28mlen\u001b[39m(x\u001b[38;5;241m.\u001b[39mshape) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m4\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m (query_layer, key_layer, value_layer)\n\u001b[1;32m 8303\u001b[0m ), \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mQueries, keys and values must be 4D tensors when qkv_format = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mqkv_format\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m!\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 8304\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m qkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msbhd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 8305\u001b[0m max_seqlen_q \u001b[38;5;241m=\u001b[39m query_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m max_seqlen_q \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m max_seqlen_q\n", + "\u001b[0;31mAssertionError\u001b[0m: Queries, keys and values must be 4D tensors when qkv_format = bshd!" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "b5d40836", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "source": [ + "The speedup of **62%** was obtained." + ] + }, + { + "cell_type": "markdown", + "id": "006d18e8", + "metadata": {}, + "source": [ + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | " + ] + }, + { + "cell_type": "markdown", + "id": "2bbf3d47", + "metadata": {}, + "source": [ + "## [Improvement 2] Use of THD attention layout.\n", + "\n", + "Input sequences can have various lengths. Hugging Face generation – as can be seen in Animation 1 – pads the sequences and then uses attention mask. In the THD attention layout cumulative sequence lengths and offsets need to be provided, instead of attention mask. The THD attention layout is much more optimized than BSHD layout.\n", + "\n", + "The class `transformer_engine.pytorch.DotProductAttention` supports this format. 
One needs to pass the following arguments to its forward call (a short sketch follows this list):\n", + "- `seq_offsets_q`, `seq_offsets_k`, `seq_offsets_v` – offsets of the beginning of each sequence in the packed tensors,\n", + "- `cu_seqlens_q`, `cu_seqlens_kv` – cumulative sums of the sequence lengths for the query and key/value tensors,\n", + "- `max_seqlen_q` – maximum sequence length in the query layer,\n", + "- `max_seqlen_kv` – maximum sequence length in the key/value layer.\n", + "\n",
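+ "Below is a minimal sketch of such a call, assuming illustrative shapes, sequence lengths and variable names (it is not part of this tutorial's code):\n",
+ "\n",
+ "```\n",
+ "import torch\n",
+ "import transformer_engine.pytorch as te\n",
+ "\n",
+ "num_heads, head_dim = 16, 256\n",
+ "seq_lens = torch.tensor([5, 3], dtype=torch.int32, device=\"cuda\")  # two packed sequences\n",
+ "cu_seqlens = torch.zeros(len(seq_lens) + 1, dtype=torch.int32, device=\"cuda\")\n",
+ "cu_seqlens[1:] = torch.cumsum(seq_lens, dim=0)  # cumulative lengths: [0, 5, 8]\n",
+ "total_tokens = int(cu_seqlens[-1])\n",
+ "\n",
+ "# THD layout: the tokens of all sequences are packed along the first dimension.\n",
+ "q = torch.randn(total_tokens, num_heads, head_dim, dtype=torch.bfloat16, device=\"cuda\")\n",
+ "k, v = torch.randn_like(q), torch.randn_like(q)\n",
+ "\n",
+ "attn = te.DotProductAttention(num_heads, head_dim, attn_mask_type=\"padding_causal\")\n",
+ "out = attn(\n",
+ "    q, k, v,\n",
+ "    qkv_format=\"thd\",\n",
+ "    cu_seqlens_q=cu_seqlens, cu_seqlens_kv=cu_seqlens,\n",
+ "    max_seqlen_q=int(seq_lens.max()), max_seqlen_kv=int(seq_lens.max()),\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "Because the sequences are packed back to back, no compute is spent on padding tokens.\n",
+ "\n",
+ "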
\n", + "Note\n", + "\n", + "Currently, the THD attention for `TransformerLayer` is supported only for token generation.\n", + "
\n", + "\n", + "Let's look how using TransformerEngine with THD attention impacts the speed of token generation:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4fc5e1cd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. 
They are very bad at doing different things at the same time.\n",
+ "\n",
+ "This is why they are so good at rendering 3D graphics.\n",
+ "\n",
+ "The GPU\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Restart the notebook (to flush the GPU memory)\n",
+ "from utils import restart_jupyter_notebook\n",
+ "restart_jupyter_notebook()\n",
+ "\n",
+ "from utils import *\n",
+ "\n",
+ "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n",
+ "hyperparams.qkv_format = \"thd\"\n",
+ "\n",
+ "model = init_te_gemma_model(hyperparams)\n",
+ "\n",
+ "print_sample_of_generated_texts(model)\n",
+ "# benchmark_generation(model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e397a65",
+ "metadata": {},
+ "source": [
+ "By using THD attention, the following speedup was obtained:\n",
+ "\n",
+ "| Models | Time (s) | Speedup | \n",
+ "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n",
+ "| HF (baseline) | 87.68 | 1 |\n",
+ "| TE (substitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n",
+ "| TE + THD attention | 28.22 | 3.11 | "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "21a89d9c",
+ "metadata": {},
+ "source": [
+ "## [Improvement 3] Speeding up generation with CUDA Graphs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e2d53e7b",
+ "metadata": {},
+ "source": [
+ "Transformer Engine includes the function `transformer_engine.pytorch.make_graphed_callables`, which works similarly to its PyTorch counterpart and can record CUDA Graphs for Transformer Engine modules. Below is a code excerpt from the class `TEGemmaForCausalLMCudaGraphs` in `te_gemma.py`:\n",
+ "```\n",
+ "    def __init__(self, config: GemmaConfig):\n",
+ "        (...)\n",
+ "\n",
+ "        # Here \"the trick\" happens. We override methods from TEGemmaForCausalLM\n",
+ "        # with their recorded versions. After each invocation, the captured graph\n",
+ "        # is replayed with minimal CPU usage, which leads to a huge speedup.\n",
+ "        (...)\n",
+ "        self._model_context_phase = self.record_graph(\n",
+ "            self._model_context_phase, self.hidden_states_buffer)  # CUDA Graphs recording\n",
+ "\n",
+ "        (...)\n",
+ "        self._model_generation_phase = self.record_graph(\n",
+ "            self._model_generation_phase, self.generation_buffer)  # CUDA Graphs recording\n",
+ "\n",
+ "    @torch.no_grad()\n",
+ "    def record_graph(self, function, input_tensor):\n",
+ "        (...)\n",
+ "        # The function is invoked on (input_tensor,) and all launched kernels are recorded.\n",
+ "        # record_graph() returns the captured function, which can later be run with minimal use of the CPU.\n",
+ "        fp8_format = Format.HYBRID\n",
+ "        fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=32, amax_compute_algo=\"max\")\n",
+ "        with autocast(dtype=torch.bfloat16, cache_enabled=False):\n",
+ "            graphed_function = te.pytorch.make_graphed_callables(\n",
+ "                function,\n",
+ "                (input_tensor,),\n",
+ "                fp8_enabled=True,\n",
+ "                fp8_recipe=fp8_recipe,\n",
+ "                allow_unused_input=True,\n",
+ "                num_warmup_iters=3,\n",
+ "            )\n",
+ "        return graphed_function\n",
+ "```\n",
+ "\n",
+ "It is strongly recommended to review the entire code of the class `TEGemmaForCausalLMCudaGraphs`. Let's now proceed to evaluate the performance improvement offered by CUDA Graphs.\n",
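+ "\n",
+ "For orientation, here is a minimal, self-contained sketch of the same `make_graphed_callables` pattern applied to a toy `te.Linear` layer; the layer, shapes and buffer names are illustrative assumptions and are not taken from `te_gemma.py`:\n",
+ "\n",
+ "```\n",
+ "import torch\n",
+ "import transformer_engine.pytorch as te\n",
+ "\n",
+ "# A toy module standing in for the context/generation phase callables.\n",
+ "layer = te.Linear(1024, 1024, params_dtype=torch.bfloat16, device=\"cuda\")\n",
+ "\n",
+ "# A static sample input - the captured graph is replayed on fixed buffers.\n",
+ "static_input = torch.randn(8, 1024, dtype=torch.bfloat16, device=\"cuda\")\n",
+ "\n",
+ "graphed_layer = te.make_graphed_callables(\n",
+ "    layer,\n",
+ "    (static_input,),\n",
+ "    num_warmup_iters=3,\n",
+ "    allow_unused_input=True,\n",
+ ")\n",
+ "\n",
+ "# Each call now replays the captured kernels with minimal CPU overhead.\n",
+ "out = graphed_layer(static_input)\n",
+ "```"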
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "31a3a8a3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. 
They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "\n", + "# It is necessary to preallocate a static buffer.\n", + "# CUDA graphs require static input tensors for every kernel.\n", + "# This approach may result in a slight increase in memory consumption;\n", + "# however, the substantial speedup achieved makes it worthwhile.\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "53bb430f", + "metadata": {}, + "source": [ + "The **5.23x** speedup was obtained.\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | \n", + "| TE + THD attention + CUDA Graphs | 16.75 | 5.23 | \n" + ] + }, + { + "cell_type": "markdown", + "id": "0a11b75c", + "metadata": {}, + "source": [ + "Let's look at the screenshots from *NVIDIA Nsight System* profiler to see where this speedup comes from:\n", + "\n", + "
\n", + "\n", + "
\n", + "Figure 5: Without CUDA Graphs. One can see that GPU (blue) is idle for big portion of the time.\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "Figure 6: With CUDA Graphs. One can see that GPU (orange) is fully utilized.\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "e6b171a0", + "metadata": {}, + "source": [ + "## [Improvement 4] Running generation in FP8 of the model trained in higher precision " + ] + }, + { + "cell_type": "markdown", + "id": "1a80288b", + "metadata": {}, + "source": [ + "Implementing FP8 generation with the Gemma model is not straightforward, because this model was initially trained using BF16 precision, and the necessary FP8 scaling factors are missing. Running the model at this lower precision without proper scaling could lead to significant errors and incorrect results.\n", + "\n", + "It is highly recommended to familiarize oneself with the [tutorial](../../examples/fp8_primer.ipynb) on FP8 precision to understand the necessity of scaling.\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 8: The FP8 scaling factors are incorrect and that leads to numerical errors. The weight calibration allows us to compute FP8 metadata during the forwards in higher precision.\n", + "
\n", + "
\n", + "\n", + "### Weight Calibration\n", + "\n", + "To address the issue outlined above, weight calibration will be used. This involves running several forward iterations at BF16 precision within the context `te.fp8_autocast(enabled=False, calibration=True)`. This setup allows the forward pass to operate at higher precision, while simultaneously collecting `amax_history` and other parameters related to the FP8 precision, which are essential for calculating the FP8 scaling well.\n", + "\n", + "The code below outlines the steps to initialize the BF16 model and conduct several forward iterations within the specified context. After these iterations, the model is saved, and these weights will be utilized in subsequent chapters." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "aecee0e1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Repo card metadata block was not found. Setting CardData to empty.\n", + "[WARNING | huggingface_hub.repocard]: Repo card metadata block was not found. Setting CardData to empty.\n", + "Repo card metadata block was not found. Setting CardData to empty.\n", + "[WARNING | huggingface_hub.repocard]: Repo card metadata block was not found. Setting CardData to empty.\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "import transformer_engine.pytorch as te\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.fuse_qkv_params = True # This is needed by the last improvement.\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "# Calibration\n", + "with te.fp8_autocast(enabled=False, calibrating=True), \\\n", + " torch.autocast(device_type='cuda', dtype=torch.bfloat16):\n", + " model.train()\n", + " run_forward_pass(model, hyperparams, num_iters=512)\n", + "\n", + "# Compute scale_fwd with enabled fp8 autocast\n", + "with te.fp8_autocast(enabled=True), \\\n", + " torch.autocast(device_type='cuda', dtype=torch.bfloat16):\n", + " run_forward_pass(model, hyperparams, 1)\n", + "\n", + "# Some parameters are in pointing to the same tensors, double save is avoided here.\n", + "dict_to_save = {k: v for k, v in model.state_dict().items() \\\n", + " if (\"_context_phase\" not in k and \"_generation_phase\" not in k)}\n", + "torch.save(dict_to_save, 'calibrated_weights.pth') # <== Add path to save calibrated weights." + ] + }, + { + "cell_type": "markdown", + "id": "b6dcd135", + "metadata": {}, + "source": [ + "|\n", + "### Generation in FP8\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 8: After the weight calibration FP8 scaling factors are correct and prevent numerical errors.\n", + "
\n", + "
\n", + "\n", + "Now FP8 inference is ready to be run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38e005f5", + "metadata": {}, + "outputs": [], + "source": [ + "!ls -alh /perfhome/repos/data/gemma-7b-hf/" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a913f54d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. 
GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "hyperparams.fuse_qkv_params = True # This is needed by the last improvement.\n", + "\n", + "hyperparams.fp8 = True\n", + "# Calibrated fp8 weights are loaded directly from the file.\n", + "\n", + "hyperparams.fp8_model_weights_filename = \"calibrated_weights.pth\" # <== Add calibrated weights location here.\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "8cdbb56c", + "metadata": {}, + "source": [ + "One can observe that the outputs are coherent; however, the generation time has increased. Why is this the case?\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 9: Running the model at higher precision involves only one GEMM operation. However, when the model operates in FP8, it requires not just the low-precision GEMM but also weight casting.\n", + "
\n", + "
\n", + "\n", + "Running the model in FP8 does not imply that all weights are stored in FP8. By default, they are stored in higher precision and are cast to FP8, using saved scaling factors, before operations such as GEMMs.\n", + "\n", + "This approach is beneficial during training: one can perform one cast for both backward and forward passes, leading to speedups. However, performing a single cast for each forward pass introduces too much overhead to achieve a speedup. This issue will be addressed in the next section of the tutorial." + ] + }, + { + "cell_type": "markdown", + "id": "8d3945e3", + "metadata": {}, + "source": [ + "### Use of only FP8 model weights" + ] + }, + { + "cell_type": "markdown", + "id": "2dd0cba9", + "metadata": {}, + "source": [ + "TransformerEngine stores parameters in higher precision and only casts them to FP8. It may be necessary to maintain accucacy during training. However, high precision is not needed when doing inference. \n", + "\n", + "Transformer Engine supports maintaining only FP8 weights with `fp8_model_init` decorator. Let's see an example\n", + "```\n", + "linear = te.Linear(1024, 1024) # this module is initialized with full precision weights\n", + "with te.fp8_model_init(enabled=True):\n", + " linear_fp8 = te.Linear(1024, 1024) # this module is initialized only with fp8 weights\n", + "\n", + "assert type(linear.weight.data) is torch.Tensor\n", + "assert type(linear_fp8.weight.data) is te.float8_tensor.Float8Tensor\n", + "```\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 10: Using fp8_model_init stores the weights directly in FP8 format, which reduces both time and memory usage.\n", + "
\n", + "
\n", + "\n", + "Let's run the code with `fp8_model_init`:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96264b9c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. 
GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.fuse_qkv_params = True # Needed for fp8_model_init().\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "hyperparams.fp8 = True\n", + "hyperparams.fp8_model_init = True # This will result in storing only fp8 weights.\n", + "hyperparams.fp8_model_weights_filename = \"calibrated_weights.pth\" # <== Add calibrated weights location here.\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "3e30ca5a", + "metadata": {}, + "source": [ + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | \n", + "| TE + THD attention + CUDA Graphs | 16.75 | 5.23 | \n", + "| TE + THD attention + FP8 | 12.13 | 7.23 | \n", + "\n", + "The final speedup is **7.23x**." + ] + }, + { + "cell_type": "markdown", + "id": "c6e87275", + "metadata": {}, + "source": [ + "## Conclusions" + ] + }, + { + "cell_type": "markdown", + "id": "7bb2452d", + "metadata": {}, + "source": [ + "\n", + "
\n", + "\n", + "
\n", + " Figure 11: Times obtained with optimizations using TransformerEngine (seconds).\n", + "
\n", + "
\n", + "\n", + "In this tutorial, we've explored three features of the Transformer Engine:\n", + "1. Support for the THD attention layout,\n", + "2. Integration with CUDA Graphs,\n", + "3. FP8 weights calibration,\n", + "4. Models containing only FP8 version of their parameters.\n", + "\n", + "Each of these features can be applied in various contexts, such as fast token generation. It's important to note that the fastest possible inference speeds can be achieved using NVIDIA's inference-optimized [TensorRT](https://developer.nvidia.com/tensorrt) library." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/utils.py b/docs/examples/te_gemma/utils.py new file mode 100755 index 0000000000..46577071c8 --- /dev/null +++ b/docs/examples/te_gemma/utils.py @@ -0,0 +1,366 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import time +import sys +import IPython +import random +import string + +from te_gemma_loading_weights import load_te_model +from te_llama_loading_weights import load_te_model as load_te_model_llama +import torch +from torch.optim import AdamW +from torch.utils.data import DataLoader + +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + get_linear_schedule_with_warmup, + AutoConfig, +) +from transformers import DataCollatorForLanguageModeling +from datasets import load_dataset +from accelerate import Accelerator +from accelerate.utils.dataclasses import FP8RecipeKwargs + + +from te_gemma import TEGemmaForCausalLM, TEGemmaForCausalLMCudaGraphs +from te_llama import TELlamaForCausalLM, TELlamaForCausalLMCudaGraphs + +class HyperParameters: + def __init__(self): + self.mixed_precision = "bf16" + self.model_name = None + + self.fp8 = False + + # Weights in fp8 + self.fp8_model_weights_filename = None + self.fp8_model_init = False + + # Cuda graphs + self.generation_cuda_graphs = False + self.cuda_graphs_static_batch_size = 16 + self.cuda_graphs_static_max_seq_len = 256 + self.cuda_graphs_static_max_context_len = 16 + + # Finetuning settings. + self.dataset_name = "timdettmers/openassistant-guanaco" + self.dataset_text_field = "text" + self.learning_rate = 1.41e-5 + self.batch_size = 8 + self.max_seq_length = 256 + self.gradient_accumulation_steps = 1 + self.num_warmup_steps = 5 + self.num_training_steps = 10 + + # QKV format. + self.fuse_qkv_params = False + self.qkv_format = "bshd" + + +hyperparams = HyperParameters() + +assert ( + torch.backends.cudnn.version() >= 90100 +), "cuDNN version >= 9.1.0 is needed to run this tutorial." 
+ + +def get_dataloaders(accelerator: Accelerator, hyperparams): + dataset = load_dataset(hyperparams.dataset_name, split="train") + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize(element): + outputs = tokenizer( + element["text"], + truncation=True, + padding=False, + max_length=hyperparams.max_seq_length, + return_overflowing_tokens=False, + return_length=False, + ) + return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]} + + with accelerator.main_process_first(): + dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names) + + # Simply pad to the multiple of 16 for both FP8 and BF16 precision + pad_to_multiple_of = 16 + data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm=False, + pad_to_multiple_of=pad_to_multiple_of, + ) + + dataloader_params = { + "batch_size": hyperparams.batch_size, + "collate_fn": data_collator, + "drop_last": True, + } + train_dataloader = DataLoader(dataset, **dataloader_params) + return train_dataloader + + +def init_baseline_model(hyperparams): + # Init the model + config = AutoConfig.from_pretrained(hyperparams.model_name) + # make sure to use flash_attention to do iso comparison with TEGemmaModel + config._attn_implementation = "flash_attention_2" + model = AutoModelForCausalLM.from_pretrained( + hyperparams.model_name, + config=config, + torch_dtype=torch.bfloat16, + ) + return model.cuda() + + +def init_te_llama_model(hyperparams): + cls = TELlamaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TELlamaForCausalLM + config = AutoConfig.from_pretrained(hyperparams.model_name) + config._attn_implementation = "flash_attention_2" + # config.hidden_size = 1024 + # config.head_dim = 128 + print(config) + # Adding all params from the hyperparams to the config to make the code simpler. + for key, value in hyperparams.__dict__.items(): + setattr(config, key, value) + model = load_te_model_llama(cls, config) + if hyperparams.generation_cuda_graphs: + model.record() + return model.cuda() + +def init_te_gemma_model(hyperparams): + cls = TEGemmaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TEGemmaForCausalLM + config = AutoConfig.from_pretrained(hyperparams.model_name) + config._attn_implementation = "flash_attention_2" + # config.hidden_size = 1024 + # config.head_dim = 128 + print(config) + # Adding all params from the hyperparams to the config to make the code simpler. 
+ for key, value in hyperparams.__dict__.items(): + setattr(config, key, value) + model = load_te_model(cls, config) + if hyperparams.generation_cuda_graphs: + model.record() + return model.cuda() + + +def wrap_with_accelerator(model, hyperparams): + # Create FP8 kwarg handler if required + fp8_kwarg_handler = ( + [FP8RecipeKwargs(backend="te")] if hyperparams.mixed_precision == "fp8" else None + ) + + # Init HF accelerator that's used for training + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision=hyperparams.mixed_precision, + kwargs_handlers=fp8_kwarg_handler, + ) + # accelerator.print(f'State: {accelerator.state}') + train_dataloader = get_dataloaders(accelerator, hyperparams) + + # Wrap model, optimizer/scheduler, dataloaders in accelerate + optimizer = AdamW(params=model.parameters(), lr=hyperparams.learning_rate, fused=True) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=100, + num_training_steps=hyperparams.num_training_steps, + ) + model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + return accelerator, model, optimizer, train_dataloader, lr_scheduler + + +def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler): + model.train() + optimizer.zero_grad() + train_dataloader = enumerate(train_dataloader) + + def run_iters(num_iters): + for _ in range(num_iters): + _, batch = next(train_dataloader) + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + run_iters(hyperparams.num_warmup_steps) # Warmup iters + + # Get the timers ready + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + + start.record() + run_iters(hyperparams.num_training_steps) # Training iters + torch.cuda.synchronize() + end.record() + accelerator.end_training() + + print( + f"""{hyperparams.num_training_steps} finetuning steps complete!\n + Average time taken per step: + {(start.elapsed_time(end)/hyperparams.num_training_steps):.0f} + milliseconds""" + ) + + +def restart_jupyter_notebook(): + # Try restarting the Jupyter kernel + IPython.Application.instance().kernel.do_shutdown(True) + + # Check whether the device memory has been flushed + if torch.cuda.memory_allocated() != 0: + import warnings + + warnings.warn("The device memory hasn't been flushed, trying with a second method!") + + # Try restarting the Jupyter kernel another way + # Restart the kernel + from IPython.core.display import HTML + + HTML("") + + if torch.cuda.memory_allocated() != 0: + print( + "The device memory hasn't been flushed, try manually restarting the Jupyter kernel!" + ) + + # Suppress the warnings + if not sys.warnoptions: + import warnings + + warnings.simplefilter("ignore") + torch.set_warn_always(False) + + +@torch.no_grad() +def run_forward_pass(model, hyperparams, num_iters): + """ + It runs num_iters forward passes with sample data. + """ + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision="no", + ) + train_dataloader = get_dataloaders(accelerator, hyperparams) + + # @sudhakars: what's the point of calling `model.train` inside `no_grad` + # context? 
+ model.train() + train_dataloader = enumerate(train_dataloader) + + for _ in range(num_iters): + _, batch = next(train_dataloader) + batch["input_ids"] = batch["input_ids"].cuda() + batch['attention_mask'] = batch["attention_mask"].cuda() + model(input_ids = batch["input_ids"], attention_mask = batch['attention_mask']) + + +""" + Benchmarking and example generation functions. +""" + + +def print_sample_of_generated_texts(model): + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + prompts = ["Here are the two facts about GPUs:", "Some facts about NVIDIA:"] + prompts *= 32 + inputs = tokenizer(prompts, return_tensors="pt", padding=True) + + + max_length = inputs["input_ids"].size(1) + new_length = ((max_length + 63) // 64) * 128 + + # Add padding to the left + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], (new_length - max_length, 0), value=tokenizer.pad_token_id + ) + + # Add padding to the left (only intended for baseline generation with HF + # which expects padding to the left) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (new_length - max_length, 0), value=0 + ) + + inputs["input_ids"] = inputs["input_ids"].cuda() + inputs["attention_mask"] = inputs["attention_mask"].cuda() + + outputs = model.generate(**inputs, max_new_tokens=50) + generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + def print_output(prompts, generated_texts, idx): + print("=" * 30 + f" Generation example {idx+1} " + "=" * 30) + print("Prompt:") + print(generated_texts[idx][: len(prompts[idx])]) + print("Generated text:") + print(generated_texts[idx][len(prompts[idx]) :]) + + for i in range(5): + print_output(prompts, generated_texts, i) + + +def _generate_random_words(num_words, max_word_length): + words = [] + for _ in range(num_words): + word_length = random.randint(1, max_word_length) + word = "".join(random.choices(string.ascii_lowercase, k=word_length)) + words.append(word) + return words + + +def benchmark_generation(model): + batch_size = 64 + context_length = 128 + max_new_tokens = 156 - 128 + print("=" * 30 + " Benchmarking " + "=" * 30) + print( + f"Benchmarking for batch_size = {batch_size} and max total tokens =" + f" {context_length + max_new_tokens}" + ) + + input_str = _generate_random_words(batch_size, context_length) + + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + inputs = tokenizer(input_str, return_tensors="pt", padding=True) + + max_length = inputs["input_ids"].size(1) + + # Add padding to the left + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], (context_length - max_length, 0), value=tokenizer.pad_token_id + ) + + # Add padding to the left (only intended for baseline generation with HF + # which expects padding to the left) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (context_length - max_length, 0), value=0 + ) + + inputs["input_ids"] = inputs["input_ids"].cuda() + inputs["attention_mask"] = inputs["attention_mask"].cuda() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start.record() + + model.generate(inputs["input_ids"].cuda(), max_new_tokens=max_new_tokens) + torch.cuda.synchronize() + end.record() + + print(f"Time: {start.elapsed_time(end)/1000:.2f} s.") diff --git a/transformer_engine/pytorch/attention/inference.py 
b/transformer_engine/pytorch/attention/inference.py index 8267bf63c7..8a33a7f047 100644 --- a/transformer_engine/pytorch/attention/inference.py +++ b/transformer_engine/pytorch/attention/inference.py @@ -214,6 +214,12 @@ def __init__( dtype=torch.int32, device=torch.cuda.current_device(), ) + self.cu_pre_step_seqlens = torch.zeros( + self.max_batch_size, + dtype=torch.int32, + device=torch.cuda.current_device(), + ) + def reset(self): """Reset InferenceParams state""" @@ -280,9 +286,12 @@ def pre_step( def get_seqlens_pre_step(self): """Get cached sequence lengths before the stepping""" - return torch.Tensor(list(self.sequences_pre_step.values())).to( + seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( dtype=torch.int32, device="cpu" ) + # return seqlens.cuda() + self.cu_pre_step_seqlens[:len(seqlens)].copy_(seqlens, non_blocking=True) + return self.cu_pre_step_seqlens def convert_paged_to_nonpaged(self, layer_number: int): """ @@ -455,14 +464,14 @@ def pre_step( finished_seqs = self.sequences.keys() - unfinished_seqs unfinished_indices = [i for i, j in enumerate(self.sequences) if j in unfinished_seqs] finished_indices = [i for i, j in enumerate(self.sequences) if j in finished_seqs] - self.batch_indices.copy_( + self.batch_indices.data[:].copy_( torch.Tensor( ( unfinished_indices + finished_indices + list(range(prev_batch_size, self.max_batch_size)) ) - ).to(dtype=torch.int32, device="cpu") + ) ) # Advance unfinished sequences diff --git a/transformer_engine/pytorch/attention/multi_head_attention.py b/transformer_engine/pytorch/attention/multi_head_attention.py index a9a687ef15..ec51d68cdd 100644 --- a/transformer_engine/pytorch/attention/multi_head_attention.py +++ b/transformer_engine/pytorch/attention/multi_head_attention.py @@ -764,8 +764,8 @@ def forward( # sequence_start = inference_params.seqlens[0] sequence_end = sequence_start + sequence_length - q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...] - k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...] + # q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...] + # k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...] 
query_layer = apply_rotary_pos_emb( query_layer, @@ -775,6 +775,7 @@ def forward( cu_seqlens=cu_seqlens_q, cp_size=self.cp_size, cp_rank=self.cp_rank, + start_positions=sequence_start, ) key_layer = apply_rotary_pos_emb( key_layer, @@ -784,6 +785,7 @@ def forward( cu_seqlens=cu_seqlens_kv, cp_size=self.cp_size, cp_rank=self.cp_rank, + start_positions=sequence_start, ) # =========================== diff --git a/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp b/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp index b13a90f876..200f6817fc 100644 --- a/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp +++ b/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp @@ -27,9 +27,10 @@ at::Tensor fused_rope_forward(const at::Tensor &input, const at::Tensor &freqs, auto freqs_cu = makeTransformerEngineTensor(freqs); auto output_cu = makeTransformerEngineTensor(output); - auto start_positions_cu = transformer_engine::TensorWrapper(); // empty cu_seqlens tensor + auto start_positions_cu = transformer_engine::TensorWrapper(); // empty start_positions tensor if (start_positions) { start_positions_cu = makeTransformerEngineTensor(start_positions.value()); + TORCH_CHECK(start_positions_cu.ndim() == 1, "expected 1D tensor"); } if (qkv_format == NVTE_QKV_Format::NVTE_THD) { From d56f4393f5f12995dbd38f1b611a955cb7e94c36 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Mon, 16 Jun 2025 11:47:52 -0700 Subject: [PATCH 2/7] remove extraneous code for easy debu Signed-off-by: Sudhakar Singh --- docs/examples/te_gemma/run_generation.py | 2 +- docs/examples/te_gemma/te_gemma.py | 310 +++------ docs/examples/te_gemma/te_gemma_save.py | 829 +++++++++++++++++++++++ 3 files changed, 932 insertions(+), 209 deletions(-) create mode 100755 docs/examples/te_gemma/te_gemma_save.py diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py index eb781f11cf..6c45b9d670 100755 --- a/docs/examples/te_gemma/run_generation.py +++ b/docs/examples/te_gemma/run_generation.py @@ -19,4 +19,4 @@ model = init_te_gemma_model(hyperparams) print_sample_of_generated_texts(model) -benchmark_generation(model) +# benchmark_generation(model) diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py index f24b700979..bab980cc28 100755 --- a/docs/examples/te_gemma/te_gemma.py +++ b/docs/examples/te_gemma/te_gemma.py @@ -19,87 +19,6 @@ import torch.nn.functional as F -def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): - """ - Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which - will be used later. - - (Currently a hack, this should be reformatted to a better method) - """ - - assert lengths_tensor is not None and max_input_length is not None, \ - "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" - torch.add( - inference_params.cached_sequence_lengths, - inference_params.input_sequence_lengths, - out=inference_params.cached_sequence_lengths) - # inference_params.input_sequence_lengths[:len(lengths_tensor)].copy_(lengths_tensor, non_blocking=True) - inference_params.input_sequence_lengths.copy_(lengths_tensor) - - inference_params.max_incoming_seq_len = max_input_length - - max_seqlen_q, max_seqlen_kv = inference_params.max_incoming_seq_len, inference_params.max_sequence_length - - # # Allocation of buffers, it works correctly with CUDA Graphs. 
- _allocator = StaticBufferAllocator() - NR_BUFFERS = 4 - - cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded = [ - _allocator(inference_params.max_batch_size + 1, dtype=torch.int32, device="cuda") - for _ in range(NR_BUFFERS) - ] - - torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) - torch.cumsum( - inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, - dim=0, out=cu_seqlens_kv[1:]) - # If layer has shape [b * s_layer, h, d] - # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] - cu_seqlens_q_padded.copy_( - torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q) - cu_seqlens_kv_padded.copy_( - torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv) - - # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) - - # print(inference_params.step_dict) - - def get_cache_params_in_infer_params(): - return max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded - - # For the time being, create an ad-hoc field in `inference_params` to get the variables. - # @sudhakars: to create a better way later. - inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params - -# This class has been modified from -# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py -class GemmaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) - self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) - - @torch.no_grad() - def forward(self, x, position_ids, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - self.inv_freq.to(x.device) - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - return emb.unsqueeze(2) # should return in [b, s, 1, d] format - - class StaticBufferAllocator(torch.nn.Module): """ This class is used when we use te.make_graphed_callable(). @@ -152,6 +71,10 @@ def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): zero_centered_gamma=True, ) + self.te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( + max_seq_len=self.gemma_config.max_position_embeddings + ).cuda() + def alloc(self, size, dtype, device): """ Allocated the buffer and works correctly with CUDA Graphs. @@ -160,67 +83,26 @@ def alloc(self, size, dtype, device): def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. 
- # if "self_attn_mask_type" in kwargs: - # attn_mask_type = kwargs['self_attn_mask_type'] - # else: - # attn_mask_type = "whatever_default_is" - - # if attn_mask_type == "arbitrary": - # # @sudhakars: following logic doesn't work for `thd` - # attn_mask = kwargs['attention_mask'] - # attention_mask_inv = ~attn_mask - # generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 - - # if generation_case: - # # @sudhakars: for some reason, `attention_mask` for generation is of the - # # form [b, 1, 1, s]. - # attention_mask_inv = attention_mask_inv.squeeze(1).squeeze(1) - # assert torch.tensor(torch.tensor(attention_mask_inv.shape).shape).item() == 2 - - # # Create `position_ids` on the fly using `attention_mask` since HF - # # does the same in generation logic. - # position_ids = attention_mask_inv.long().cumsum(-1) - 1 - # position_ids.masked_fill_(attention_mask_inv == 0, 1) - - # if "position_ids" in kwargs and kwargs['position_ids'] is not None: - # assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" - - # # convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for - # # context phase and context phase gets [b, s] sized attn mask - # seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] - # arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() - # for sample_idx in range(attn_mask.shape[0]): - # pad_len = attn_mask[sample_idx].sum().int().item() - # # set the columns to padded - # arbitrary_attn_mask[sample_idx, :, :, :pad_len] = True - # # set the rows to padded - # if not generation_case: - # arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True - # arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() - - # # Update the attention mask to arbitrary - # kwargs['attention_mask'] = arbitrary_attn_mask.cuda() - - # # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding - # # @sudhakars: change the hardcoded `dim` to something like config.head_dim - # te_rope_emb = GemmaRotaryEmbedding(dim=256, max_position_embeddings=self.gemma_config.max_position_embeddings).cuda() - # te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) - # else: + # When the `attention_mask` is not `arbitrary`, then for the purpose # of this tutorial, we're using `padding_causal` (for context) and # `padding` (for generation) # @sudhakars: find a better way to provide the `tensor_format` - te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( - max_seq_len=self.gemma_config.max_position_embeddings - ).cuda() + inference_params = kwargs["inference_params"] # @sudhakars: big assumption that the input is "sbhd" # batch_size = args[0].shape[0] - if inference_params.qkv_format_legacy == "thd": - ( - max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded - ) = inference_params.get_cache_params_from_infer_params() + + # if inference_params.qkv_format_legacy == "thd": + # cache_params = kwargs["cache_params"] + # max_seqlen_q = cache_params.max_seqlen_q + # max_seqlen_kv = cache_params.max_seqlen_kv + # cu_seqlens_q = cache_params.cu_seqlens_q + # cu_seqlens_kv = cache_params.cu_seqlens_kv + # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded + # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded + # print(f"input_sequence_lengths (in forward): 
\n{inference_params.input_sequence_lengths}") # this args cannot be passed to TransformerLayer keys_to_remove = [ @@ -233,16 +115,15 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e for key in keys_to_remove: kwargs.pop(key, None) - # import pdb; pdb.set_trace() # We need to return tuple to be compatible with HF. return ( super().forward( *args, - rotary_pos_emb=te_rope_emb, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_kv=cu_seqlens_kv, - max_seqlen_q=max_seqlen_q, - max_seqlen_kv=max_seqlen_kv, + rotary_pos_emb=self.te_rope_emb, + # cu_seqlens_q=cu_seqlens_q, + # cu_seqlens_kv=cu_seqlens_kv, + # max_seqlen_q=max_seqlen_q, + # max_seqlen_kv=max_seqlen_kv, **kwargs ), ) @@ -271,11 +152,13 @@ def set_inference_params(self, inference_params): # @sudhakars: is `arbitrary` fine being the default here? def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + print(f"StaticGemmaModel forward start") with torch.no_grad(): # static operation - for CUDA graphs hidden_states.data[:] = hidden_states.data[:] * self.normalizer for i, decoder_layer in enumerate(self.model.layers): + # print(f"layer {i}") hidden_states.data[:] = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -288,7 +171,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = No hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs logits = self.lm_head(hidden_states) logits = logits.float() - return logits + return logits, hidden_states class GemmaGenerator(torch.nn.Module): @@ -311,7 +194,7 @@ def set_inference_params(self, inference_params): # @sudhakars: is `arbitrary` a good default value here? def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): - logits = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -325,27 +208,6 @@ def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_m return next_tokens -class PartialForwardWrapper(torch.nn.Module): - """ - This class wraps a `torch.nn.Module` while partially modifying its `forward` - - CUDAGraphs' `make_graphed_callables` method takes in a module but if only - `functools.partial` is used to wrap the module, it changes the modules' - type and that interferes with the `make_graphed_callables` intrinsics. - """ - def __init__(self, module, **kwargs): - super().__init__() - self.module = module - self.partial_forward = partial(self.module.forward, **kwargs) - - def __call__(self, *args, **kwargs): - return self.partial_forward(*args, **kwargs) - - # @sudhakars: should we use better abstraction? 
- def set_inference_params(self, *args, **kwargs): - return self.module.set_inference_params(*args, **kwargs) - - @contextmanager def replace_decoder(te_decoder_cls): """ @@ -442,13 +304,19 @@ def _create_inference_params(self, *args, **kwargs): *args, **kwargs ) - max_batch_size = kwargs["max_batch_size"] + # max_batch_size = kwargs["max_batch_size"] # Initialize some legacy params - infer_params.cached_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) - infer_params.input_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) + # _allocator = StaticBufferAllocator() + # infer_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # infer_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # These are updated in setup_cache_params_from_infer_params and they should be static for + # the duration of the context as well as the generation phase. + # infer_params.cu_seqlens_q, infer_params.cu_seqlens_kv, infer_params.cu_seqlens_q_padded, infer_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] return infer_params @@ -478,39 +346,42 @@ def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): # import pdb; pdb.set_trace() hidden_states = self._create_hidden_states_buffer(input_ids) - hidden_states.data[:] = self.model.embed_tokens(input_ids) + hidden_states.copy_(self.model.embed_tokens(input_ids)) # We need to update offsets before every forward pass to make cache work properly. lengths = input_ids.ne(0).sum(dim=1) + # import pdb; pdb.set_trace() if self.config.qkv_format == "thd": # inference_params.setup_before_new_input( # lengths_tensor=lengths, max_input_length=input_ids.shape[1] # ) lengths = input_ids.ne(0).sum(dim=1) - max_input_length = input_ids.shape[1] - setup_cache_params_from_infer_params(inference_params, lengths, max_input_length) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) else: inference_params.setup_before_new_input(length=input_ids.shape[1]) - logits = self._model_context_phase( + logits, hs_buffer = self._model_context_phase( hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary" + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", ) # We choose logits coresponding with last token in each sequence, # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) # Tensor when qkv_format == "thd" and - # they are the last token in the sequence when qkv_format != "thd". - if self.config.qkv_format == "thd": - logits = logits[ + # they are the last token in the sequence when qkv_format != "thd". 
+ # import pdb; pdb.set_trace() + import pdb; pdb.set_trace() - torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, : - ] - else: - logits = logits[:, -1, :] + # if self.config.qkv_format == "thd": + # logits = logits[ + + # torch.arange(logits.size(0)), lengths - 1, : + # ] + # else: + logits = logits[:, -1, :] next_tokens = torch.argmax(logits, dim=1) @@ -572,7 +443,7 @@ def generate( inference_params = self._create_inference_params( max_batch_size=batch_size, # num_layers=self.config.num_hidden_layers, - max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + max_sequence_length=128, num_heads_kv=self.config.num_key_value_heads, # num_heads_q=self.config.num_attention_heads, head_dim_v=self.config.head_dim, @@ -584,29 +455,34 @@ def generate( # is_cuda_graph=False ) - def init_cache_params_in_infer_params(inference_params): - inference_params.cached_sequence_lengths = torch.zeros( - (batch_size,), device="cuda", dtype=torch.int32) - inference_params.input_sequence_lengths = torch.zeros( - (batch_size,), device="cuda", dtype=torch.int32) + # def init_cache_params_in_infer_params(inference_params): + # _allocator = StaticBufferAllocator() + # inference_params.cached_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") + # inference_params.input_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") - init_cache_params_in_infer_params(inference_params) - inference_params.qkv_format_legacy = self.config.qkv_format + # init_cache_params_in_infer_params(inference_params) + + + # inference_params.qkv_format_legacy = self.config.qkv_format self._model_context_phase.set_inference_params(inference_params) self._model_generation_phase.set_inference_params(inference_params) + print(f"context phase start") + # import pdb; pdb.set_trace() hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + print(f"context phase done") # Generation phase. 
if self.config.qkv_format == "thd": # inference_params.setup_before_new_input( # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - setup_cache_params_from_infer_params(inference_params, - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), - max_input_length=1) + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) else: inference_params.setup_before_new_input(length=1) @@ -637,9 +513,8 @@ def init_cache_params_in_infer_params(inference_params): # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - setup_cache_params_from_infer_params(inference_params, - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), - max_input_length=1) + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) else: inference_params.setup_before_new_input(length=1) # next_tokens is static output tensor, so we need to clone it @@ -706,12 +581,16 @@ def __init__(self, config: GemmaConfig): ) ## Taken from TEGemmaForCausalLM above - max_batch_size = self.config.cuda_graphs_static_batch_size - # Initialize some legacy params - self.inference_params.cached_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) - self.inference_params.input_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) + # max_batch_size = self.config.cuda_graphs_static_batch_size + # # Initialize some legacy params + # _allocator = StaticBufferAllocator() + # self.inference_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # self.inference_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # self.inference_params.cu_seqlens_q, self.inference_params.cu_seqlens_kv, self.inference_params.cu_seqlens_q_padded, self.inference_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] # def init_cache_params_in_infer_params(inference_params): # inference_params.cached_sequence_lengths = torch.zeros( @@ -720,7 +599,7 @@ def __init__(self, config: GemmaConfig): # (batch_size,), device="cuda", dtype=torch.int32) # init_cache_params_in_infer_params(inference_params) - self.inference_params.qkv_format_legacy = self.config.qkv_format + # self.inference_params.qkv_format_legacy = self.config.qkv_format self._model_generation_phase.set_inference_params(self.inference_params) self._model_context_phase.set_inference_params(self.inference_params) @@ -737,30 +616,45 @@ def record(self): self.config.cuda_graphs_static_batch_size, self.config.cuda_graphs_static_max_context_len, ) - self.inference_params.reset() + # self.inference_params.reset() # self.inference_params.setup_before_new_input( # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), # max_input_length=input_shape[1], # ) - lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda") + + # [1] Should be same as lengths_tensor from TEGemmaForCausalLM + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - setup_cache_params_from_infer_params(self.inference_params, lengths, max_input_length) + 
self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + + print(f"context phase recording start") + # self._model_context_phase.model.layers = torch.nn.ModuleList([ + # self.record_graph( + # layer, + # self.hidden_states_buffer, + # self_attn_mask_type="padding_causal", + # inference_params=self.inference_params + # ) + # for layer in self._model_context_phase.model.layers + # ]) self._model_context_phase = self.record_graph( self._model_context_phase, self.hidden_states_buffer, attn_mask_type="padding_causal" ) # CUDA Graphs recording + print(f"context phase recording done") input_shape = (self.config.cuda_graphs_static_batch_size, 1) - self.inference_params.reset() + # self.inference_params.reset() # self.inference_params.setup_before_new_input( # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), # max_input_length=input_shape[1], # ) - lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda") + lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - setup_cache_params_from_infer_params(self.inference_params, lengths, max_input_length) + + self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) self._model_generation_phase = self.record_graph( self._model_generation_phase, @@ -802,7 +696,7 @@ def record_graph(self, function, input_tensor, **sample_kwargs): fp8_enabled=self.config.fp8, fp8_recipe=fp8_recipe, allow_unused_input=True, - num_warmup_iters=3, + num_warmup_iters=5, sample_kwargs=sample_kwargs, ) return graphed_function diff --git a/docs/examples/te_gemma/te_gemma_save.py b/docs/examples/te_gemma/te_gemma_save.py new file mode 100755 index 0000000000..a46f6a9b94 --- /dev/null +++ b/docs/examples/te_gemma/te_gemma_save.py @@ -0,0 +1,829 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.gemma.modeling_gemma import GemmaForCausalLM, GemmaConfig, GemmaModel + +import torch.nn.functional as F + +class CacheParams: + def __init__(self, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded): + self.max_seqlen_q = max_seqlen_q + self.max_seqlen_kv = max_seqlen_kv + self.cu_seqlens_q = cu_seqlens_q + self.cu_seqlens_kv = cu_seqlens_kv + self.cu_seqlens_q_padded = cu_seqlens_q_padded + self.cu_seqlens_kv_padded = cu_seqlens_kv_padded + + +def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): + """ + Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which + will be used later. 
+ + (Currently a hack, this should be reformatted to a better method) + """ + + assert lengths_tensor is not None and max_input_length is not None, \ + "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + + inference_params.max_incoming_seq_len = max_input_length + + lengths_tensor = lengths_tensor.to(inference_params.cu_seqlens_q.device) + + # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + + # print(inference_params.step_dict) + + # def get_cache_params_in_infer_params(): + # return CacheParams(max_seqlen_q, max_seqlen_kv, inference_params.cu_seqlens_q, inference_params.cu_seqlens_kv, inference_params.cu_seqlens_q_padded, inference_params.cu_seqlens_kv_padded) + + # For the time being, create an ad-hoc field in `inference_params` to get the variables. + # @sudhakars: to create a better way later. + # inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + +# This class has been modified from +# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py +class GemmaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + return emb.unsqueeze(2) # should return in [b, s, 1, d] format + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthless, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) + +class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `GemmaDecoderLayer` and easier to replace it in the code. 
+ + Args: + config: GemmaConfig + args: positional args (for compatibility with `GemmaDecoderLayer`) + kwargs: keyword args (for compatibility with `GemmaDecoderLayer`) + """ + + def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): + + self.gemma_config = config + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="geglu", + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.gemma_config.head_dim, + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, + ) + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. + """ + return self._allocator(size, dtype, device) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. + + # if "self_attn_mask_type" in kwargs: + # attn_mask_type = kwargs['self_attn_mask_type'] + # else: + # attn_mask_type = "whatever_default_is" + + # if attn_mask_type == "arbitrary": + # # @sudhakars: following logic doesn't work for `thd` + # attn_mask = kwargs['attention_mask'] + # attention_mask_inv = ~attn_mask + # generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + + # if generation_case: + # # @sudhakars: for some reason, `attention_mask` for generation is of the + # # form [b, 1, 1, s]. + # attention_mask_inv = attention_mask_inv.squeeze(1).squeeze(1) + # assert torch.tensor(torch.tensor(attention_mask_inv.shape).shape).item() == 2 + + # # Create `position_ids` on the fly using `attention_mask` since HF + # # does the same in generation logic. + # position_ids = attention_mask_inv.long().cumsum(-1) - 1 + # position_ids.masked_fill_(attention_mask_inv == 0, 1) + + # if "position_ids" in kwargs and kwargs['position_ids'] is not None: + # assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" 
+ + # # convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for + # # context phase and context phase gets [b, s] sized attn mask + # seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] + # arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() + # for sample_idx in range(attn_mask.shape[0]): + # pad_len = attn_mask[sample_idx].sum().int().item() + # # set the columns to padded + # arbitrary_attn_mask[sample_idx, :, :, :pad_len] = True + # # set the rows to padded + # if not generation_case: + # arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True + # arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() + + # # Update the attention mask to arbitrary + # kwargs['attention_mask'] = arbitrary_attn_mask.cuda() + + # # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding + # # @sudhakars: change the hardcoded `dim` to something like config.head_dim + # te_rope_emb = GemmaRotaryEmbedding(dim=256, max_position_embeddings=self.gemma_config.max_position_embeddings).cuda() + # te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) + # else: + # When the `attention_mask` is not `arbitrary`, then for the purpose + # of this tutorial, we're using `padding_causal` (for context) and + # `padding` (for generation) + # @sudhakars: find a better way to provide the `tensor_format` + te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( + max_seq_len=self.gemma_config.max_position_embeddings + ).cuda() + + inference_params = kwargs["inference_params"] + # @sudhakars: big assumption that the input is "sbhd" + # batch_size = args[0].shape[0] + + # if inference_params.qkv_format_legacy == "thd": + # cache_params = kwargs["cache_params"] + # max_seqlen_q = cache_params.max_seqlen_q + # max_seqlen_kv = cache_params.max_seqlen_kv + # cu_seqlens_q = cache_params.cu_seqlens_q + # cu_seqlens_kv = cache_params.cu_seqlens_kv + # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded + # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded + # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") + + # this args cannot be passed to TransformerLayer + keys_to_remove = [ + "position_ids", + "past_key_value", + "output_attentions", + "use_cache", + "cache_position", + ] + for key in keys_to_remove: + kwargs.pop(key, None) + + # We need to return tuple to be compatible with HF. + return ( + super().forward( + *args, + rotary_pos_emb=te_rope_emb, + # cu_seqlens_q=cu_seqlens_q, + # cu_seqlens_kv=cu_seqlens_kv, + # max_seqlen_q=max_seqlen_q, + # max_seqlen_kv=max_seqlen_kv, + **kwargs + ), + ) + +class StaticGemmaModel(torch.nn.Module): + """ + StaticGemma is based of HF GemmaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: GemmaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.normalizer = torch.tensor(self.model.config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? 
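        # Note on the forward pass below: results are written back into the caller-provided
        # `hidden_states` buffer (`hidden_states.data[:] = ...` and `hidden_states.copy_(...)`),
        # so the tensor's storage address never changes between CUDA Graph replays; only the
        # buffer contents are updated in place.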
+ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + print(f"StaticGemmaModel forward start") + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + # print(f"layer {i}") + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits, hidden_states + + +class GemmaGenerator(torch.nn.Module): + """ + GemmaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: GemmaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.gemma_layers = StaticGemmaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.gemma_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + return next_tokens + + +class PartialForwardWrapper(torch.nn.Module): + """ + This class wraps a `torch.nn.Module` while partially modifying its `forward` + + CUDAGraphs' `make_graphed_callables` method takes in a module but if only + `functools.partial` is used to wrap the module, it changes the modules' + type and that interferes with the `make_graphed_callables` intrinsics. + """ + def __init__(self, module, **kwargs): + super().__init__() + self.module = module + self.partial_forward = partial(self.module.forward, **kwargs) + + def __call__(self, *args, **kwargs): + return self.partial_forward(*args, **kwargs) + + # @sudhakars: should we use better abstraction? + def set_inference_params(self, *args, **kwargs): + return self.module.set_inference_params(*args, **kwargs) + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `GemmaDecoderLayer` with custom `TEGemmaDecoderLayer`. + """ + original_gemma_decoder_cls = transformers.models.gemma.modeling_gemma.GemmaDecoderLayer + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = original_gemma_decoder_cls + + +class TEGemmaForCausalLM(GemmaForCausalLM): + """ + Causal LM created with `GemmaModel`. The underlying `GemmaDecoderLayer` + class is monkey-patched with `TEGemmaDecoderLayer` class before + initializing the causal LM with `GemmaForCausalLM`. 
+ + Args: + config: GemmaConfig + """ + + def __init__(self, config: GemmaConfig): + with replace_decoder(te_decoder_cls=TEGemmaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = GemmaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticGemmaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths, max_seq_len=None): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) if max_seq_len is None else max_seq_len + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. fix the max seq len to + # the longest sequence in the batch + actual_max_seq_len = max_seq_len + inputs.data = new_input_ids[:, :actual_max_seq_len] + print(f"actual_max_seq_len: {actual_max_seq_len}") + + # For Paged Attention, make the valid sequences, multiple of 64 + # inputs.data = new_input_ids[:, :4].repeat(1, 16) + + + def _next_64_multiply(self, x): + return ((x + 63) // 64) * 64 + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_hidden_states_buffer(self, input_ids: torch.Tensor): + tensor = torch.empty( + (input_ids.shape[0], input_ids.shape[1], self.hidden_size), + device="cuda", + dtype=torch.float32, + ) + # import pdb; pdb.set_trace() + return tensor + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_inference_params(self, *args, **kwargs): + infer_params = InferenceParams( + *args, **kwargs + ) + + # max_batch_size = kwargs["max_batch_size"] + + # Initialize some legacy params + # _allocator = StaticBufferAllocator() + # infer_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # infer_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # These are updated in setup_cache_params_from_infer_params and they should be static for + # the duration of the context as well as the generation phase. + # infer_params.cu_seqlens_q, infer_params.cu_seqlens_kv, infer_params.cu_seqlens_q_padded, infer_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] + + return infer_params + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _get_max_input_seq_len(self, input_ids): + return input_ids.shape[1] \ + if not hasattr(self.config, "cuda_graphs_static_max_context_len") \ + else self.config.cuda_graphs_static_max_context_len + + # The buffer for generation is some part (beginning) of hidden states buffer. 
+ # This function returns pointer to it and also copies there data if provided. + def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): + # hidden_states_buffer has shape [b, s, hd] + # generation_buffer will have shape [b, 1, hd] + # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)" + # will return uncontiguous buffer, which we want to avoid. + output = hidden_states_buffer.view(-1)[ + : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2] + ] + if data_to_copy is not None: + output.copy_(data_to_copy.reshape(-1)) + generation_buffer = output.view( + (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2]) + ) + return generation_buffer + + def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): + # import pdb; pdb.set_trace() + hidden_states = self._create_hidden_states_buffer(input_ids) + hidden_states.data[:] = self.model.embed_tokens(input_ids) + + # We need to update offsets before every forward pass to make cache work properly. + lengths = input_ids.ne(0).sum(dim=1) + + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=lengths, max_input_length=input_ids.shape[1] + # ) + lengths = input_ids.ne(0).sum(dim=1) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + else: + inference_params.setup_before_new_input(length=input_ids.shape[1]) + + + logits, hs_buffer = self._model_context_phase( + hidden_states, + attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", + ) + + # We choose logits coresponding with last token in each sequence, + # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) + # Tensor when qkv_format == "thd" and + # they are the last token in the sequence when qkv_format != "thd". + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + logits = logits[ + + torch.arange(logits.size(0)), lengths - 1, : + ] + else: + logits = logits[:, -1, :] + + next_tokens = torch.argmax(logits, dim=1) + + # self.hidden_states have shape [b, s, hd]. + # We return hidden state for the last token - output has shape [b, 1, hd] + hidden_states = self._get_generation_buffer( + hidden_states, self.model.embed_tokens(next_tokens) + ) + return hidden_states, next_tokens + + def _make_mask_one_token_longer(self, mask): + return torch.cat( + [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1 + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + pad_token_id: int = 0, + max_new_tokens: int = 0, + *args, + **kwargs + ): + self.eval() + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. 
+ with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast( + enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None + ): + + lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] + + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) + + # This is not needed since the padding to the left is already done in utils.py + # # Pad input_ids with zeros on the left to match max_input_sequence_len + # # This adds padding tokens (0) to the left side of each sequence in the batch + # # Shape goes from [batch_size, seq_len] to [batch_size, max_input_sequence_len] + # input_ids = F.pad( + # input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 + # ) + + if self.config.qkv_format == "thd": + # For thd layout padding is at the end, otherwise at the beginning. + TEGemmaForCausalLM._padding_to_end(input_ids, + lengths, + max_seq_len=self.config.cuda_graphs_static_max_context_len \ + if self.config.generation_cuda_graphs else None + ) + + # import pdb; pdb.set_trace() + + # InferenceParams is a cache, where keys and values of previous tokens are stored. + # Moreover it stores length of both already generated and input sequences. + inference_params = self._create_inference_params( + max_batch_size=batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + # is_cuda_graph=False + ) + + # def init_cache_params_in_infer_params(inference_params): + # _allocator = StaticBufferAllocator() + # inference_params.cached_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") + # inference_params.input_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") + + # init_cache_params_in_infer_params(inference_params) + + + # inference_params.qkv_format_legacy = self.config.qkv_format + + self._model_context_phase.set_inference_params(inference_params) + self._model_generation_phase.set_inference_params(inference_params) + + print(f"context phase start") + # import pdb; pdb.set_trace() + hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + + print(f"context phase done") + # Generation phase. + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. 
+ # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + # setup_cache_params_from_infer_params(inference_params, input_ids) + # @sudhakars: could create position_ids from mask here + next_tokens = self._model_generation_phase(hidden_states, mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. + if self.config.qkv_format == "thd": + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + else: + inference_params.setup_before_new_input(length=1) + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="arbitrary" + ) + return logits + +class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): + """ + TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: GemmaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + self.config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. 
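        # A minimal sketch (hypothetical shapes b, s, h) of why _get_generation_buffer slices the
        # flat storage instead of indexing the first token position directly:
        #   buf = torch.empty(b, s, h, device="cuda")
        #   gen = buf[:, 0, :].unsqueeze(1)            # shares storage, but is not contiguous
        #   gen = buf.view(-1)[: b * h].view(b, 1, h)  # contiguous view of the same static storage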
+ self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) + # self.inference_params = InferenceParams( + # max_batch_size=config.cuda_graphs_static_batch_size, + # max_sequence_length=config.cuda_graphs_static_max_seq_len, + # qkv_format="thd", + # ) + self.inference_params = InferenceParams( + max_batch_size=self.config.cuda_graphs_static_batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self.config.cuda_graphs_static_max_seq_len, + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + # is_cuda_graph=False + ) + + ## Taken from TEGemmaForCausalLM above + # max_batch_size = self.config.cuda_graphs_static_batch_size + # # Initialize some legacy params + # _allocator = StaticBufferAllocator() + # self.inference_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # self.inference_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # self.inference_params.cu_seqlens_q, self.inference_params.cu_seqlens_kv, self.inference_params.cu_seqlens_q_padded, self.inference_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] + + # def init_cache_params_in_infer_params(inference_params): + # inference_params.cached_sequence_lengths = torch.zeros( + # (batch_size,), device="cuda", dtype=torch.int32) + # inference_params.input_sequence_lengths = torch.zeros( + # (batch_size,), device="cuda", dtype=torch.int32) + # init_cache_params_in_infer_params(inference_params) + + # self.inference_params.qkv_format_legacy = self.config.qkv_format + + self._model_generation_phase.set_inference_params(self.inference_params) + self._model_context_phase.set_inference_params(self.inference_params) + + def record(self): + # We want to record model in training=False, because it will be used in generation. + self.eval() + + # Here "the trick" happens. We override methods from TEGemmaForCausalLM + # with their recorded version. After invocation of each of them, + # captured graph will be replayed with minimal usage of CPU, + # what will lead to huge speedup. 
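        # Rough sketch of the record/replay flow implemented below (record_graph is defined
        # later in this class):
        #   self._model_context_phase = self.record_graph(self._model_context_phase,
        #                                                 self.hidden_states_buffer, ...)
        #   logits, _ = self._model_context_phase(self.hidden_states_buffer, ...)  # replays the graph
        # The replayed call only reads and writes the static buffers captured during recording,
        # so generate() must keep using exactly those buffers.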
+ input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + # self.inference_params.reset() + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + # max_input_length=input_shape[1], + # ) + + # [1] Should be same as lengths_tensor from TEGemmaForCausalLM + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) + max_input_length = input_shape[1] + + self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + + print(f"context phase recording start") + # self._model_context_phase.model.layers = torch.nn.ModuleList([ + # self.record_graph( + # layer, + # self.hidden_states_buffer, + # self_attn_mask_type="padding_causal", + # inference_params=self.inference_params + # ) + # for layer in self._model_context_phase.model.layers + # ]) + self._model_context_phase = self.record_graph( + self._model_context_phase, + self.hidden_states_buffer, + attn_mask_type="padding_causal" + ) # CUDA Graphs recording + + print(f"context phase recording done") + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + # self.inference_params.reset() + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + # max_input_length=input_shape[1], + # ) + lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) + max_input_length = input_shape[1] + + self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + + self._model_generation_phase = self.record_graph( + self._model_generation_phase, + self.generation_buffer, + attn_mask_type="padding" + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor, **sample_kwargs): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. 
+ with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=5, + sample_kwargs=sample_kwargs, + ) + return graphed_function From 6cd3c1a6fe44a0327ba286dbf15e1ceb34eab3de Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Tue, 17 Jun 2025 15:27:06 -0700 Subject: [PATCH 3/7] make cuda graphs work with non-paged and paged attention Signed-off-by: Sudhakar Singh --- docs/examples/te_gemma/te_gemma.py | 218 ++++-------------- .../pytorch/attention/inference.py | 20 +- 2 files changed, 54 insertions(+), 184 deletions(-) diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py index bab980cc28..cd59a081e8 100755 --- a/docs/examples/te_gemma/te_gemma.py +++ b/docs/examples/te_gemma/te_gemma.py @@ -19,22 +19,6 @@ import torch.nn.functional as F -class StaticBufferAllocator(torch.nn.Module): - """ - This class is used when we use te.make_graphed_callable(). - CUDA Graphs require all tensors to be static. Neverthless, - torch API make_graphed_callable() takes care of output of torch modules, - and makes them static. Thus by wrapping allocation of memory into - torch.nn.Module, we can greatly simplify our code. - """ - - # pylint: disable=no-self-use - def forward(self, size, dtype, device): - """ - Return buffer of given size, dtype and device. - """ - return torch.zeros(size, dtype=dtype, device=device) - class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): """ Wrapper class over TE's `TransformerLayer`. This makes the wrapper very @@ -71,39 +55,8 @@ def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): zero_centered_gamma=True, ) - self.te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( - max_seq_len=self.gemma_config.max_position_embeddings - ).cuda() - - def alloc(self, size, dtype, device): - """ - Allocated the buffer and works correctly with CUDA Graphs. - """ - return self._allocator(size, dtype, device) - def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. - - # When the `attention_mask` is not `arbitrary`, then for the purpose - # of this tutorial, we're using `padding_causal` (for context) and - # `padding` (for generation) - # @sudhakars: find a better way to provide the `tensor_format` - - - inference_params = kwargs["inference_params"] - # @sudhakars: big assumption that the input is "sbhd" - # batch_size = args[0].shape[0] - - # if inference_params.qkv_format_legacy == "thd": - # cache_params = kwargs["cache_params"] - # max_seqlen_q = cache_params.max_seqlen_q - # max_seqlen_kv = cache_params.max_seqlen_kv - # cu_seqlens_q = cache_params.cu_seqlens_q - # cu_seqlens_kv = cache_params.cu_seqlens_kv - # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded - # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded - # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") - # this args cannot be passed to TransformerLayer keys_to_remove = [ "position_ids", @@ -115,15 +68,12 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e for key in keys_to_remove: kwargs.pop(key, None) + rope_emb = kwargs.pop("rope_emb", None) # We need to return tuple to be compatible with HF. 
return ( super().forward( *args, - rotary_pos_emb=self.te_rope_emb, - # cu_seqlens_q=cu_seqlens_q, - # cu_seqlens_kv=cu_seqlens_kv, - # max_seqlen_q=max_seqlen_q, - # max_seqlen_kv=max_seqlen_kv, + rotary_pos_emb=rope_emb, **kwargs ), ) @@ -151,8 +101,8 @@ def set_inference_params(self, inference_params): self.inference_params = inference_params # @sudhakars: is `arbitrary` fine being the default here? - def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): - print(f"StaticGemmaModel forward start") + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary", rope_emb: torch.Tensor = None): + # print(f"StaticGemmaModel forward start") with torch.no_grad(): # static operation - for CUDA graphs hidden_states.data[:] = hidden_states.data[:] * self.normalizer @@ -164,6 +114,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = No attention_mask=attention_mask, self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, inference_params=self.inference_params, + rope_emb=rope_emb )[ 0 ] # static copy - for CUDA graphs @@ -193,8 +144,8 @@ def set_inference_params(self, inference_params): self.gemma_layers.set_inference_params(inference_params) # @sudhakars: is `arbitrary` a good default value here? - def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): - logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary", rope_emb: torch.Tensor = None): + logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type, rope_emb=rope_emb) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -252,6 +203,10 @@ def __init__(self, config: GemmaConfig): fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" ) + self.te_rope_emb = RotaryPositionEmbedding(self.config.head_dim)( + max_seq_len=self.config.max_position_embeddings + ).cuda() + @staticmethod def _padding_to_end(inputs, lengths, max_seq_len=None): """ @@ -279,11 +234,13 @@ def _padding_to_end(inputs, lengths, max_seq_len=None): # the longest sequence in the batch actual_max_seq_len = max_seq_len inputs.data = new_input_ids[:, :actual_max_seq_len] - print(f"actual_max_seq_len: {actual_max_seq_len}") + # print(f"actual_max_seq_len: {actual_max_seq_len}") # For Paged Attention, make the valid sequences, multiple of 64 # inputs.data = new_input_ids[:, :4].repeat(1, 16) - + # import pdb; pdb.set_trace() + # print(f"inputs.data.shape: {inputs.data.shape}") + # exit() def _next_64_multiply(self, x): return ((x + 63) // 64) * 64 @@ -303,21 +260,6 @@ def _create_inference_params(self, *args, **kwargs): infer_params = InferenceParams( *args, **kwargs ) - - # max_batch_size = kwargs["max_batch_size"] - - # Initialize some legacy params - # _allocator = StaticBufferAllocator() - # infer_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") - # infer_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") - - # These are updated in setup_cache_params_from_infer_params and they should be static for - # the duration of the context as well as the generation phase. 
- # infer_params.cu_seqlens_q, infer_params.cu_seqlens_kv, infer_params.cu_seqlens_q_padded, infer_params.cu_seqlens_kv_padded = [ - # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") - # for _ in range(4) - # ] - return infer_params # This function is overriden in TeGEmmaForCausalLMCudaGraphs. @@ -366,22 +308,16 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", + rope_emb=self.te_rope_emb ) - # We choose logits coresponding with last token in each sequence, - # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) - # Tensor when qkv_format == "thd" and - # they are the last token in the sequence when qkv_format != "thd". - # import pdb; pdb.set_trace() - import pdb; pdb.set_trace() - - # if self.config.qkv_format == "thd": - # logits = logits[ + if self.config.qkv_format == "thd": + logits = logits[ - # torch.arange(logits.size(0)), lengths - 1, : - # ] - # else: - logits = logits[:, -1, :] + torch.arange(logits.size(0)), lengths - 1, : + ] + else: + logits = logits[:, -1, :] next_tokens = torch.argmax(logits, dim=1) @@ -416,17 +352,8 @@ def generate( lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] - batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( - input_ids - ) - - # This is not needed since the padding to the left is already done in utils.py - # # Pad input_ids with zeros on the left to match max_input_sequence_len - # # This adds padding tokens (0) to the left side of each sequence in the batch - # # Shape goes from [batch_size, seq_len] to [batch_size, max_input_sequence_len] - # input_ids = F.pad( - # input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 - # ) + # print(f"max_input_sequence_len: {max_input_sequence_len}") + # exit() if self.config.qkv_format == "thd": # For thd layout padding is at the end, otherwise at the beginning. @@ -436,7 +363,9 @@ def generate( if self.config.generation_cuda_graphs else None ) - # import pdb; pdb.set_trace() + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) # InferenceParams is a cache, where keys and values of previous tokens are stored. # Moreover it stores length of both already generated and input sequences. 
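            # A minimal sketch of how this cache object is driven (values are illustrative; the
            # real call sites below pass config-derived arguments):
            #
            #     from collections import OrderedDict
            #     infer_params = InferenceParams(
            #         max_batch_size=4,
            #         max_sequence_length=128,
            #         num_heads_kv=16,
            #         head_dim_k=256,
            #         head_dim_v=256,
            #         dtype=torch.bfloat16,
            #         is_paged=False,
            #     )
            #     # Context phase: register the real prompt length of every sequence in the batch.
            #     infer_params.pre_step(OrderedDict({0: 12, 1: 7, 2: 20, 3: 5}))
            #     # Every generation step afterwards advances each sequence by exactly one token.
            #     infer_params.pre_step(OrderedDict({i: 1 for i in range(4)}))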
@@ -451,36 +380,19 @@ def generate( dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) - # is_cuda_graph=False + total_num_pages=64 * 128 // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) ) - # def init_cache_params_in_infer_params(inference_params): - # _allocator = StaticBufferAllocator() - # inference_params.cached_sequence_lengths = _allocator( - # (batch_size,), dtype=torch.int32, device="cuda") - # inference_params.input_sequence_lengths = _allocator( - # (batch_size,), dtype=torch.int32, device="cuda") - - # init_cache_params_in_infer_params(inference_params) - - - # inference_params.qkv_format_legacy = self.config.qkv_format - self._model_context_phase.set_inference_params(inference_params) self._model_generation_phase.set_inference_params(inference_params) - print(f"context phase start") + # print(f"context phase start") # import pdb; pdb.set_trace() hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) - print(f"context phase done") + # print(f"context phase done") # Generation phase. if self.config.qkv_format == "thd": - # inference_params.setup_before_new_input( - # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), - # max_input_length=1, - # ) lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) else: @@ -499,9 +411,7 @@ def generate( # include the next token to be generated mask = self._make_mask_one_token_longer(mask) - # setup_cache_params_from_infer_params(inference_params, input_ids) - # @sudhakars: could create position_ids from mask here - next_tokens = self._model_generation_phase(hidden_states, mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + next_tokens = self._model_generation_phase(hidden_states, mask=mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary", rope_emb=self.te_rope_emb) # self.inference_params contains for example kv_cache. # This needs to be called before every pass, @@ -509,10 +419,6 @@ def generate( # Here we increase sequence offsets by one, # because we generated one token for every sequence. if self.config.qkv_format == "thd": - # self.inference_params.setup_before_new_input( - # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), - # max_input_length=1, - # ) lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) else: @@ -558,13 +464,9 @@ def __init__(self, config: GemmaConfig): self.config.hidden_size, ) ).cuda() + # This is in fact part of the buffer for hidden_states. 
self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) - # self.inference_params = InferenceParams( - # max_batch_size=config.cuda_graphs_static_batch_size, - # max_sequence_length=config.cuda_graphs_static_max_seq_len, - # qkv_format="thd", - # ) self.inference_params = InferenceParams( max_batch_size=self.config.cuda_graphs_static_batch_size, # num_layers=self.config.num_hidden_layers, @@ -576,31 +478,9 @@ def __init__(self, config: GemmaConfig): dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) - # is_cuda_graph=False + total_num_pages=64 * self.config.cuda_graphs_static_max_seq_len // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) ) - ## Taken from TEGemmaForCausalLM above - # max_batch_size = self.config.cuda_graphs_static_batch_size - # # Initialize some legacy params - # _allocator = StaticBufferAllocator() - # self.inference_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") - # self.inference_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") - - # self.inference_params.cu_seqlens_q, self.inference_params.cu_seqlens_kv, self.inference_params.cu_seqlens_q_padded, self.inference_params.cu_seqlens_kv_padded = [ - # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") - # for _ in range(4) - # ] - - # def init_cache_params_in_infer_params(inference_params): - # inference_params.cached_sequence_lengths = torch.zeros( - # (batch_size,), device="cuda", dtype=torch.int32) - # inference_params.input_sequence_lengths = torch.zeros( - # (batch_size,), device="cuda", dtype=torch.int32) - # init_cache_params_in_infer_params(inference_params) - - # self.inference_params.qkv_format_legacy = self.config.qkv_format - self._model_generation_phase.set_inference_params(self.inference_params) self._model_context_phase.set_inference_params(self.inference_params) @@ -616,11 +496,6 @@ def record(self): self.config.cuda_graphs_static_batch_size, self.config.cuda_graphs_static_max_context_len, ) - # self.inference_params.reset() - # self.inference_params.setup_before_new_input( - # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), - # max_input_length=input_shape[1], - # ) # [1] Should be same as lengths_tensor from TEGemmaForCausalLM lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) @@ -628,38 +503,27 @@ def record(self): self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) - print(f"context phase recording start") - # self._model_context_phase.model.layers = torch.nn.ModuleList([ - # self.record_graph( - # layer, - # self.hidden_states_buffer, - # self_attn_mask_type="padding_causal", - # inference_params=self.inference_params - # ) - # for layer in self._model_context_phase.model.layers - # ]) + # print(f"context phase recording start") + self._model_context_phase = self.record_graph( self._model_context_phase, self.hidden_states_buffer, - attn_mask_type="padding_causal" + attn_mask_type="padding_causal", + rope_emb=self.te_rope_emb ) # CUDA Graphs recording - print(f"context phase recording done") + # print(f"context phase recording done") input_shape = (self.config.cuda_graphs_static_batch_size, 1) - # self.inference_params.reset() - # self.inference_params.setup_before_new_input( - # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), - # 
max_input_length=input_shape[1], - # ) + lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) - max_input_length = input_shape[1] self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) self._model_generation_phase = self.record_graph( self._model_generation_phase, self.generation_buffer, - attn_mask_type="padding" + attn_mask_type="padding", + rope_emb=self.te_rope_emb ) # CUDA Graphs recording """ diff --git a/transformer_engine/pytorch/attention/inference.py b/transformer_engine/pytorch/attention/inference.py index 8a33a7f047..bcd4d7de30 100644 --- a/transformer_engine/pytorch/attention/inference.py +++ b/transformer_engine/pytorch/attention/inference.py @@ -214,7 +214,7 @@ def __init__( dtype=torch.int32, device=torch.cuda.current_device(), ) - self.cu_pre_step_seqlens = torch.zeros( + self.pre_step_seqlens = torch.zeros( self.max_batch_size, dtype=torch.int32, device=torch.cuda.current_device(), @@ -272,6 +272,11 @@ def pre_step( for k, v in self.sequences.items(): self.sequences_pre_step[k] = v - step_dict[k] + pre_step_seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( + dtype=torch.int32, device="cpu" + ) + self.pre_step_seqlens[:len(pre_step_seqlens)].copy_(pre_step_seqlens, non_blocking=True) + seqlens_q = list(step_dict.values()) cu_seqlens_q = [0] + [sum(seqlens_q[:i]) for i in range(1, self.batch_size + 1)] cu_seqlens_q = cu_seqlens_q + [cu_seqlens_q[-1]] * (self.max_batch_size - self.batch_size) @@ -286,12 +291,13 @@ def pre_step( def get_seqlens_pre_step(self): """Get cached sequence lengths before the stepping""" - seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( - dtype=torch.int32, device="cpu" - ) - # return seqlens.cuda() - self.cu_pre_step_seqlens[:len(seqlens)].copy_(seqlens, non_blocking=True) - return self.cu_pre_step_seqlens + # seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( + # dtype=torch.int32, device="cpu" + # ) + # # return seqlens.cuda() + # self.cu_pre_step_seqlens[:len(seqlens)].copy_(seqlens, non_blocking=True) + # return self.cu_pre_step_seqlens + return self.pre_step_seqlens def convert_paged_to_nonpaged(self, layer_number: int): """ From 2d12b722ad27185e23015bf7f7b61c3867625b96 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 22:27:53 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/examples/te_gemma/check_cuda_graphs.py | 27 ++- docs/examples/te_gemma/check_gemm.py | 15 +- docs/examples/te_gemma/run_generation.py | 4 +- .../examples/te_gemma/run_generation_llama.py | 4 +- docs/examples/te_gemma/te_gemma.py | 106 +++++---- docs/examples/te_gemma/te_gemma_save.py | 151 ++++++++----- docs/examples/te_gemma/te_llama.py | 203 ++++++++++++------ docs/examples/te_gemma/utils.py | 7 +- .../pytorch/attention/inference.py | 3 +- 9 files changed, 337 insertions(+), 183 deletions(-) diff --git a/docs/examples/te_gemma/check_cuda_graphs.py b/docs/examples/te_gemma/check_cuda_graphs.py index fa198db5ef..aee35f6911 100644 --- a/docs/examples/te_gemma/check_cuda_graphs.py +++ b/docs/examples/te_gemma/check_cuda_graphs.py @@ -1,6 +1,7 @@ import torch from transformer_engine.pytorch import Linear, LayerNorm + # 1. 
Define model with static buffers class TE_Model(torch.nn.Module): def __init__(self, max_seq_len=4096): @@ -10,51 +11,57 @@ def __init__(self, max_seq_len=4096): self.attn_proj = Linear(1024, 1024) # Pre-allocate static buffers - self.register_buffer('kv_cache', torch.zeros(max_seq_len, 1024, device='cuda')) - self.register_buffer('attn_mask', torch.tril(torch.ones(max_seq_len, max_seq_len, device='cuda'))) + self.register_buffer("kv_cache", torch.zeros(max_seq_len, 1024, device="cuda")) + self.register_buffer( + "attn_mask", torch.tril(torch.ones(max_seq_len, max_seq_len, device="cuda")) + ) def forward(self, hidden_states, seq_start: int): # Dynamic slicing of static buffers seq_len = hidden_states.size(1) - current_mask = self.attn_mask[seq_start:seq_start+seq_len, :seq_len] + current_mask = self.attn_mask[seq_start : seq_start + seq_len, :seq_len] x = self.ln(hidden_states) x = self.attn_proj(x) # Update KV cache (in-place) - self.kv_cache[seq_start:seq_start+seq_len].copy_(x) + self.kv_cache[seq_start : seq_start + seq_len].copy_(x) return x + # 2. Create graphable callables model = TE_Model().cuda() -static_input = torch.randn(8, 256, 1024, device='cuda') # (batch, seq, hidden) -seq_start = torch.tensor(0, device='cuda') +static_input = torch.randn(8, 256, 1024, device="cuda") # (batch, seq, hidden) +seq_start = torch.tensor(0, device="cuda") # Wrap with CUDA Graphs graph_model = torch.cuda.make_graphed_callables( [model], # Module list sample_args=[(static_input, seq_start)], # Must match actual input structure # memory_pool=torch.cuda.graphs.graph_pool_handle(), - allow_unused_input=False + allow_unused_input=False, ) + # 3. Warmup and execution def run_inference(x, seq_start): # Inputs must match sample_args' device/type/shape - x = x.to('cuda', non_blocking=True).requires_grad_(False) - seq_start = seq_start.to('cuda', non_blocking=True) + x = x.to("cuda", non_blocking=True).requires_grad_(False) + seq_start = seq_start.to("cuda", non_blocking=True) with torch.cuda.amp.autocast(): return graph_model(x, seq_start) + # Warm-up (essential for TE's kernel auto-tuner) for _ in range(3): _ = run_inference(static_input, seq_start) torch.cuda.synchronize() + # 4. Usage with dynamic sequence lengths def process_batch(inputs, start_pos): # inputs: (batch, seq) on CPU - inputs_gpu = inputs.to('cuda', non_blocking=True) + inputs_gpu = inputs.to("cuda", non_blocking=True) # Output shares memory with pre-allocated buffers return run_inference(inputs_gpu, start_pos) diff --git a/docs/examples/te_gemma/check_gemm.py b/docs/examples/te_gemma/check_gemm.py index dbcc0f53af..1ed6edd23a 100755 --- a/docs/examples/te_gemma/check_gemm.py +++ b/docs/examples/te_gemma/check_gemm.py @@ -8,11 +8,13 @@ from transformer_engine.pytorch.module.base import get_workspace import transformer_engine.pytorch.cpp_extensions as cpp_tex + @functools.lru_cache(maxsize=None) def _empty_tensor() -> torch.Tensor: """Get tensor with no entries and no data""" return torch.Tensor() + def gemm( A: torch.Tensor, B: torch.Tensor, @@ -100,20 +102,23 @@ def gemm( assert ub is not None, "ub object is None!" 
_ = fn(*args) - import pdb; pdb.set_trace() + import pdb + + pdb.set_trace() return out, grad_bias, gelu_input + if __name__ == "__main__": fc2_weight = torch.load("fc2_weight.pth").cuda() - + base_repo = "/perfhome/mnt/wkstn/work/repos/te_gemma_gen_support/TransformerEngine/docs/examples/te_gemma/" base_repo = "" gelu_out = torch.load(base_repo + "gelu_out.pth").cuda() - + activation_dtype = torch.bfloat16 fc2_bias = _empty_tensor() use_fc2_bias = False - + dim_size = list(gelu_out.size()) dim_size[1] = fc2_weight.size(0) fc2_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) @@ -129,4 +134,4 @@ def gemm( ub_algo=None, ub=None, extra_output_tensor=None, - ) \ No newline at end of file + ) diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py index 6c45b9d670..bfe610d361 100755 --- a/docs/examples/te_gemma/run_generation.py +++ b/docs/examples/te_gemma/run_generation.py @@ -1,6 +1,8 @@ from utils import * -hyperparams.model_name = "/perfhome/repos/ckpt/models/gemma-7b-hf/" # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" +hyperparams.model_name = ( # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" + "/perfhome/repos/ckpt/models/gemma-7b-hf/" +) hyperparams.qkv_format = "thd" # hyperparams.generation_cuda_graphs = True # 709.8s diff --git a/docs/examples/te_gemma/run_generation_llama.py b/docs/examples/te_gemma/run_generation_llama.py index 2f90995bd1..1c3e6626ca 100755 --- a/docs/examples/te_gemma/run_generation_llama.py +++ b/docs/examples/te_gemma/run_generation_llama.py @@ -1,6 +1,8 @@ from utils import * -hyperparams.model_name = "/perfhome/repos/ckpt/models/llama2-7b-hf/" # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" +hyperparams.model_name = ( # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" + "/perfhome/repos/ckpt/models/llama2-7b-hf/" +) hyperparams.qkv_format = "thd" # model = init_te_llama_model(hyperparams) diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py index cd59a081e8..706ea16bc4 100755 --- a/docs/examples/te_gemma/te_gemma.py +++ b/docs/examples/te_gemma/te_gemma.py @@ -19,6 +19,7 @@ import torch.nn.functional as F + class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): """ Wrapper class over TE's `TransformerLayer`. This makes the wrapper very @@ -70,13 +71,8 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e rope_emb = kwargs.pop("rope_emb", None) # We need to return tuple to be compatible with HF. - return ( - super().forward( - *args, - rotary_pos_emb=rope_emb, - **kwargs - ), - ) + return (super().forward(*args, rotary_pos_emb=rope_emb, **kwargs),) + class StaticGemmaModel(torch.nn.Module): """ @@ -101,7 +97,13 @@ def set_inference_params(self, inference_params): self.inference_params = inference_params # @sudhakars: is `arbitrary` fine being the default here? 
- def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary", rope_emb: torch.Tensor = None): + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + rope_emb: torch.Tensor = None, + ): # print(f"StaticGemmaModel forward start") with torch.no_grad(): # static operation - for CUDA graphs @@ -114,7 +116,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = No attention_mask=attention_mask, self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, inference_params=self.inference_params, - rope_emb=rope_emb + rope_emb=rope_emb, )[ 0 ] # static copy - for CUDA graphs @@ -144,8 +146,16 @@ def set_inference_params(self, inference_params): self.gemma_layers.set_inference_params(inference_params) # @sudhakars: is `arbitrary` a good default value here? - def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary", rope_emb: torch.Tensor = None): - logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type, rope_emb=rope_emb) + def forward( + self, + hidden_states: torch.Tensor, + mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + rope_emb: torch.Tensor = None, + ): + logits, _ = self.gemma_layers( + hidden_states, attention_mask=mask, attn_mask_type=attn_mask_type, rope_emb=rope_emb + ) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -257,16 +267,16 @@ def _create_hidden_states_buffer(self, input_ids: torch.Tensor): # This function is overriden in TeGEmmaForCausalLMCudaGraphs. def _create_inference_params(self, *args, **kwargs): - infer_params = InferenceParams( - *args, **kwargs - ) + infer_params = InferenceParams(*args, **kwargs) return infer_params # This function is overriden in TeGEmmaForCausalLMCudaGraphs. def _get_max_input_seq_len(self, input_ids): - return input_ids.shape[1] \ - if not hasattr(self.config, "cuda_graphs_static_max_context_len") \ - else self.config.cuda_graphs_static_max_context_len + return ( + input_ids.shape[1] + if not hasattr(self.config, "cuda_graphs_static_max_context_len") + else self.config.cuda_graphs_static_max_context_len + ) # The buffer for generation is some part (beginning) of hidden states buffer. # This function returns pointer to it and also copies there data if provided. @@ -303,19 +313,15 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf else: inference_params.setup_before_new_input(length=input_ids.shape[1]) - logits, hs_buffer = self._model_context_phase( hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", - rope_emb=self.te_rope_emb + rope_emb=self.te_rope_emb, ) if self.config.qkv_format == "thd": - logits = logits[ - - torch.arange(logits.size(0)), lengths - 1, : - ] + logits = logits[torch.arange(logits.size(0)), lengths - 1, :] else: logits = logits[:, -1, :] @@ -357,10 +363,14 @@ def generate( if self.config.qkv_format == "thd": # For thd layout padding is at the end, otherwise at the beginning. 
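                # e.g. a left-padded row [0, 0, t1, t2, t3] becomes [t1, t2, t3, 0, 0]; the thd
                # layout expects each sequence's valid tokens at the start of its row, with the
                # padding moved to the end.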
- TEGemmaForCausalLM._padding_to_end(input_ids, + TEGemmaForCausalLM._padding_to_end( + input_ids, lengths, - max_seq_len=self.config.cuda_graphs_static_max_context_len \ - if self.config.generation_cuda_graphs else None + max_seq_len=( + self.config.cuda_graphs_static_max_context_len + if self.config.generation_cuda_graphs + else None + ), ) batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( @@ -380,7 +390,7 @@ def generate( dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64 * 128 // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64 * 128 // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) ) self._model_context_phase.set_inference_params(inference_params) @@ -393,8 +403,10 @@ def generate( # print(f"context phase done") # Generation phase. if self.config.qkv_format == "thd": - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) else: inference_params.setup_before_new_input(length=1) @@ -411,7 +423,12 @@ def generate( # include the next token to be generated mask = self._make_mask_one_token_longer(mask) - next_tokens = self._model_generation_phase(hidden_states, mask=mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary", rope_emb=self.te_rope_emb) + next_tokens = self._model_generation_phase( + hidden_states, + mask=mask, + attn_mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + rope_emb=self.te_rope_emb, + ) # self.inference_params contains for example kv_cache. # This needs to be called before every pass, @@ -419,8 +436,10 @@ def generate( # Here we increase sequence offsets by one, # because we generated one token for every sequence. 
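            # As a sketch, for a batch of 4 sequences the pre_step call below
            # receives OrderedDict([(0, 1), (1, 1), (2, 1), (3, 1)]): every
            # sequence id is mapped to the single token just generated for it.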
if self.config.qkv_format == "thd": - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) else: inference_params.setup_before_new_input(length=1) # next_tokens is static output tensor, so we need to clone it @@ -435,11 +454,14 @@ def forward(self, *args, **kwargs): hidden_states = self.model.embed_tokens(kwargs["input_ids"]) logits = self._model_context_phase( hidden_states, - attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="arbitrary" + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", ) return logits + class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): """ TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM @@ -478,7 +500,9 @@ def __init__(self, config: GemmaConfig): dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64 * self.config.cuda_graphs_static_max_seq_len // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64 + * self.config.cuda_graphs_static_max_seq_len + // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) ) self._model_generation_phase.set_inference_params(self.inference_params) @@ -501,7 +525,9 @@ def record(self): lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) # print(f"context phase recording start") @@ -509,7 +535,7 @@ def record(self): self._model_context_phase, self.hidden_states_buffer, attn_mask_type="padding_causal", - rope_emb=self.te_rope_emb + rope_emb=self.te_rope_emb, ) # CUDA Graphs recording # print(f"context phase recording done") @@ -517,13 +543,15 @@ def record(self): lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) - self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) self._model_generation_phase = self.record_graph( self._model_generation_phase, self.generation_buffer, attn_mask_type="padding", - rope_emb=self.te_rope_emb + rope_emb=self.te_rope_emb, ) # CUDA Graphs recording """ diff --git a/docs/examples/te_gemma/te_gemma_save.py b/docs/examples/te_gemma/te_gemma_save.py index a46f6a9b94..c83378840c 100755 --- a/docs/examples/te_gemma/te_gemma_save.py +++ b/docs/examples/te_gemma/te_gemma_save.py @@ -19,8 +19,17 @@ import torch.nn.functional as F + class CacheParams: - def __init__(self, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded): + def __init__( + self, + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, + ): self.max_seqlen_q = max_seqlen_q self.max_seqlen_kv = max_seqlen_kv self.cu_seqlens_q = cu_seqlens_q @@ -37,15 +46,18 @@ def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_i (Currently a hack, this should be reformatted to a better method) """ - assert 
lengths_tensor is not None and max_input_length is not None, \ - "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + assert ( + lengths_tensor is not None and max_input_length is not None + ), 'lengths_tensor and max_input_length should not be none for qkv_format = "thd"' inference_params.max_incoming_seq_len = max_input_length lengths_tensor = lengths_tensor.to(inference_params.cu_seqlens_q.device) # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) # print(inference_params.step_dict) @@ -56,6 +68,7 @@ def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_i # @sudhakars: to create a better way later. # inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + # This class has been modified from # https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py class GemmaRotaryEmbedding(torch.nn.Module): @@ -65,41 +78,48 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim) + ) self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) @torch.no_grad() def forward(self, x, position_ids, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] self.inv_freq.to(x.device) - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + device_type = ( + device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + ) with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - return emb.unsqueeze(2) # should return in [b, s, 1, d] format + return emb.unsqueeze(2) # should return in [b, s, 1, d] format class StaticBufferAllocator(torch.nn.Module): """ - This class is used when we use te.make_graphed_callable(). - CUDA Graphs require all tensors to be static. Neverthless, - torch API make_graphed_callable() takes care of output of torch modules, - and makes them static. Thus by wrapping allocation of memory into - torch.nn.Module, we can greatly simplify our code. + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthless, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. 
""" # pylint: disable=no-self-use def forward(self, size, dtype, device): """ - Return buffer of given size, dtype and device. + Return buffer of given size, dtype and device. """ return torch.zeros(size, dtype=dtype, device=device) + class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): """ Wrapper class over TE's `TransformerLayer`. This makes the wrapper very @@ -138,7 +158,7 @@ def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): def alloc(self, size, dtype, device): """ - Allocated the buffer and works correctly with CUDA Graphs. + Allocated the buffer and works correctly with CUDA Graphs. """ return self._allocator(size, dtype, device) @@ -210,7 +230,7 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e # cu_seqlens_kv = cache_params.cu_seqlens_kv # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded - # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") + # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") # this args cannot be passed to TransformerLayer keys_to_remove = [ @@ -232,10 +252,11 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e # cu_seqlens_kv=cu_seqlens_kv, # max_seqlen_q=max_seqlen_q, # max_seqlen_kv=max_seqlen_kv, - **kwargs + **kwargs, ), ) + class StaticGemmaModel(torch.nn.Module): """ StaticGemma is based of HF GemmaModel class. @@ -259,7 +280,12 @@ def set_inference_params(self, inference_params): self.inference_params = inference_params # @sudhakars: is `arbitrary` fine being the default here? - def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): print(f"StaticGemmaModel forward start") with torch.no_grad(): # static operation - for CUDA graphs @@ -301,8 +327,15 @@ def set_inference_params(self, inference_params): self.gemma_layers.set_inference_params(inference_params) # @sudhakars: is `arbitrary` a good default value here? - def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): - logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + def forward( + self, + hidden_states: torch.Tensor, + mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): + logits, _ = self.gemma_layers( + hidden_states, attention_mask=mask, attn_mask_type=attn_mask_type + ) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -324,6 +357,7 @@ class PartialForwardWrapper(torch.nn.Module): `functools.partial` is used to wrap the module, it changes the modules' type and that interferes with the `make_graphed_callables` intrinsics. """ + def __init__(self, module, **kwargs): super().__init__() self.module = module @@ -413,7 +447,6 @@ def _padding_to_end(inputs, lengths, max_seq_len=None): # For Paged Attention, make the valid sequences, multiple of 64 # inputs.data = new_input_ids[:, :4].repeat(1, 16) - def _next_64_multiply(self, x): return ((x + 63) // 64) * 64 @@ -429,9 +462,7 @@ def _create_hidden_states_buffer(self, input_ids: torch.Tensor): # This function is overriden in TeGEmmaForCausalLMCudaGraphs. 
def _create_inference_params(self, *args, **kwargs): - infer_params = InferenceParams( - *args, **kwargs - ) + infer_params = InferenceParams(*args, **kwargs) # max_batch_size = kwargs["max_batch_size"] @@ -451,9 +482,11 @@ def _create_inference_params(self, *args, **kwargs): # This function is overriden in TeGEmmaForCausalLMCudaGraphs. def _get_max_input_seq_len(self, input_ids): - return input_ids.shape[1] \ - if not hasattr(self.config, "cuda_graphs_static_max_context_len") \ - else self.config.cuda_graphs_static_max_context_len + return ( + input_ids.shape[1] + if not hasattr(self.config, "cuda_graphs_static_max_context_len") + else self.config.cuda_graphs_static_max_context_len + ) # The buffer for generation is some part (beginning) of hidden states buffer. # This function returns pointer to it and also copies there data if provided. @@ -490,7 +523,6 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf else: inference_params.setup_before_new_input(length=input_ids.shape[1]) - logits, hs_buffer = self._model_context_phase( hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), @@ -503,10 +535,7 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf # they are the last token in the sequence when qkv_format != "thd". # import pdb; pdb.set_trace() if self.config.qkv_format == "thd": - logits = logits[ - - torch.arange(logits.size(0)), lengths - 1, : - ] + logits = logits[torch.arange(logits.size(0)), lengths - 1, :] else: logits = logits[:, -1, :] @@ -531,7 +560,7 @@ def generate( pad_token_id: int = 0, max_new_tokens: int = 0, *args, - **kwargs + **kwargs, ): self.eval() @@ -557,10 +586,14 @@ def generate( if self.config.qkv_format == "thd": # For thd layout padding is at the end, otherwise at the beginning. 
- TEGemmaForCausalLM._padding_to_end(input_ids, + TEGemmaForCausalLM._padding_to_end( + input_ids, lengths, - max_seq_len=self.config.cuda_graphs_static_max_context_len \ - if self.config.generation_cuda_graphs else None + max_seq_len=( + self.config.cuda_graphs_static_max_context_len + if self.config.generation_cuda_graphs + else None + ), ) # import pdb; pdb.set_trace() @@ -578,7 +611,7 @@ def generate( dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) # is_cuda_graph=False ) @@ -591,7 +624,6 @@ def generate( # init_cache_params_in_infer_params(inference_params) - # inference_params.qkv_format_legacy = self.config.qkv_format self._model_context_phase.set_inference_params(inference_params) @@ -608,8 +640,10 @@ def generate( # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) else: inference_params.setup_before_new_input(length=1) @@ -628,7 +662,11 @@ def generate( # setup_cache_params_from_infer_params(inference_params, input_ids) # @sudhakars: could create position_ids from mask here - next_tokens = self._model_generation_phase(hidden_states, mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + next_tokens = self._model_generation_phase( + hidden_states, + mask, + attn_mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + ) # self.inference_params contains for example kv_cache. 
# This needs to be called before every pass, @@ -640,8 +678,10 @@ def generate( # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) else: inference_params.setup_before_new_input(length=1) # next_tokens is static output tensor, so we need to clone it @@ -656,11 +696,14 @@ def forward(self, *args, **kwargs): hidden_states = self.model.embed_tokens(kwargs["input_ids"]) logits = self._model_context_phase( hidden_states, - attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="arbitrary" + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", ) return logits + class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): """ TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM @@ -703,7 +746,7 @@ def __init__(self, config: GemmaConfig): dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) # is_cuda_graph=False ) @@ -753,7 +796,9 @@ def record(self): lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) print(f"context phase recording start") # self._model_context_phase.model.layers = torch.nn.ModuleList([ @@ -766,9 +811,7 @@ def record(self): # for layer in self._model_context_phase.model.layers # ]) self._model_context_phase = self.record_graph( - self._model_context_phase, - self.hidden_states_buffer, - attn_mask_type="padding_causal" + self._model_context_phase, self.hidden_states_buffer, attn_mask_type="padding_causal" ) # CUDA Graphs recording print(f"context phase recording done") @@ -781,12 +824,12 @@ def record(self): lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) self._model_generation_phase = self.record_graph( - self._model_generation_phase, - self.generation_buffer, - attn_mask_type="padding" + self._model_generation_phase, self.generation_buffer, attn_mask_type="padding" ) # CUDA Graphs recording """ diff --git a/docs/examples/te_gemma/te_llama.py b/docs/examples/te_gemma/te_llama.py index 426b79cbf1..637f4f574c 100755 --- a/docs/examples/te_gemma/te_llama.py +++ b/docs/examples/te_gemma/te_llama.py @@ -19,6 +19,7 @@ import torch.nn.functional as F + def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): """ Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. 
which @@ -27,16 +28,21 @@ def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_i (Currently a hack, this should be reformatted to a better method) """ - assert lengths_tensor is not None and max_input_length is not None, \ - "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + assert ( + lengths_tensor is not None and max_input_length is not None + ), 'lengths_tensor and max_input_length should not be none for qkv_format = "thd"' torch.add( inference_params.cached_sequence_lengths, inference_params.input_sequence_lengths, - out=inference_params.cached_sequence_lengths) + out=inference_params.cached_sequence_lengths, + ) inference_params.input_sequence_lengths.copy_(lengths_tensor) inference_params.max_incoming_seq_len = max_input_length - max_seqlen_q, max_seqlen_kv = inference_params.max_incoming_seq_len, inference_params.max_sequence_length + max_seqlen_q, max_seqlen_kv = ( + inference_params.max_incoming_seq_len, + inference_params.max_sequence_length, + ) # # Allocation of buffers, it works correctly with CUDA Graphs. _allocator = StaticBufferAllocator() @@ -50,26 +56,40 @@ def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_i torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) torch.cumsum( inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, - dim=0, out=cu_seqlens_kv[1:]) + dim=0, + out=cu_seqlens_kv[1:], + ) # If layer has shape [b * s_layer, h, d] # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] cu_seqlens_q_padded.copy_( - torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q) + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q + ) cu_seqlens_kv_padded.copy_( - torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv) + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv + ) # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) # print(inference_params.step_dict) def get_cache_params_in_infer_params(): - return max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + return ( + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, + ) # For the time being, create an ad-hoc field in `inference_params` to get the variables. # @sudhakars: to create a better way later. 
inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + # This class has been modified from # https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py class LlamaRotaryEmbedding(torch.nn.Module): @@ -79,41 +99,48 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim) + ) self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) @torch.no_grad() def forward(self, x, position_ids, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] self.inv_freq.to(x.device) - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + device_type = ( + device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + ) with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - return emb.unsqueeze(2) # should return in [b, s, 1, d] format + return emb.unsqueeze(2) # should return in [b, s, 1, d] format class StaticBufferAllocator(torch.nn.Module): """ - This class is used when we use te.make_graphed_callable(). - CUDA Graphs require all tensors to be static. Neverthlessly, - torch API make_graphed_callable() takes care of output of torch modules, - and makes them static. Thus by wrapping allocation of memory into - torch.nn.Module, we can greatly simplify our code. + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthlessly, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. """ # pylint: disable=no-self-use def forward(self, size, dtype, device): """ - Return buffer of given size, dtype and device. + Return buffer of given size, dtype and device. """ return torch.zeros(size, dtype=dtype, device=device) + class TELlamaDecoderLayer(te.pytorch.TransformerLayer): """ Wrapper class over TE's `TransformerLayer`. 
This makes the wrapper very @@ -134,39 +161,39 @@ def __init__(self, config: LlamaConfig, layer_idx: int, *args, **kwargs): hidden_size=config.hidden_size, ffn_hidden_size=config.intermediate_size, num_attention_heads=config.num_attention_heads, - bias=False, # LLaMA specific + bias=False, # LLaMA specific layernorm_epsilon=config.rms_norm_eps, hidden_dropout=0, attention_dropout=0, fuse_qkv_params=config.fuse_qkv_params, normalization="RMSNorm", - activation="swiglu", # LLaMA specific + activation="swiglu", # LLaMA specific # attn_input_format=config.qkv_format, attn_input_format="bshd", num_gqa_groups=config.num_key_value_heads, - kv_channels=self.head_dim, # LLaMA specific + kv_channels=self.head_dim, # LLaMA specific layer_number=( layer_idx + 1 ), # Layer numbers in TE starts from 1, not 0 like in the HF. - zero_centered_gamma=True, # LLaMA specific + zero_centered_gamma=True, # LLaMA specific ) def alloc(self, size, dtype, device): """ - Allocated the buffer and works correctly with CUDA Graphs. + Allocated the buffer and works correctly with CUDA Graphs. """ return self._allocator(size, dtype, device) def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. if "self_attn_mask_type" in kwargs: - attn_mask_type = kwargs['self_attn_mask_type'] + attn_mask_type = kwargs["self_attn_mask_type"] else: attn_mask_type = "whatever_default_is" if attn_mask_type == "arbitrary": # @sudhakars: following logic doesn't work for `thd` - attn_mask = kwargs['attention_mask'] + attn_mask = kwargs["attention_mask"] attention_mask_inv = ~attn_mask generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 @@ -181,13 +208,21 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e position_ids = attention_mask_inv.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask_inv == 0, 1) - if "position_ids" in kwargs and kwargs['position_ids'] is not None: - assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" + if "position_ids" in kwargs and kwargs["position_ids"] is not None: + assert torch.all( + torch.eq(position_ids, kwargs["position_ids"]) + ), "position ids don't match match exactly!" 
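            # Worked example (hypothetical left-padded batch, True = padding):
            # attn_mask = [[True, True, False, False]] gives
            # attention_mask_inv = [[False, False, True, True]], so
            # cumsum(-1) - 1 = [[-1, -1, 0, 1]] and, after masked_fill_ on the
            # padded slots, position_ids = [[1, 1, 0, 1]]: the valid tokens get
            # positions 0 and 1 while padded slots hold a dummy value of 1.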
# convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for # context phase and context phase gets [b, s] sized attn mask - seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] - arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() + seq_len = ( + 1 + if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + else attention_mask_inv.shape[1] + ) + arbitrary_attn_mask = torch.zeros( + attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1] + ).bool() for sample_idx in range(attn_mask.shape[0]): pad_len = attn_mask[sample_idx].sum().int().item() # set the columns to padded @@ -195,21 +230,25 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e # set the rows to padded if not generation_case: arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True - arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() + arbitrary_attn_mask[sample_idx] = torch.tril( + arbitrary_attn_mask[sample_idx].logical_not() + ).logical_not() # Update the attention mask to arbitrary - kwargs['attention_mask'] = arbitrary_attn_mask.cuda() + kwargs["attention_mask"] = arbitrary_attn_mask.cuda() # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding # @sudhakars: change the hardcoded `dim` to something like config.head_dim - te_rope_emb = LlamaRotaryEmbedding(dim=self.head_dim, max_position_embeddings=self.llama_config.max_position_embeddings).cuda() + te_rope_emb = LlamaRotaryEmbedding( + dim=self.head_dim, max_position_embeddings=self.llama_config.max_position_embeddings + ).cuda() te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) else: # When the `attention_mask` is not `arbitrary`, then for the purpose # of this tutorial, we're using `padding_causal` (for context) and # `padding` (for generation) # @sudhakars: find a better way to provide the `tensor_format` - te_rope_emb = RotaryPositionEmbedding(self.head_dim)( # Use self.head_dim + te_rope_emb = RotaryPositionEmbedding(self.head_dim)( # Use self.head_dim max_seq_len=self.llama_config.max_position_embeddings ).cuda() @@ -218,7 +257,12 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e # batch_size = args[0].shape[0] if inference_params.qkv_format_legacy == "thd": ( - max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, ) = inference_params.get_cache_params_from_infer_params() # this args cannot be passed to TransformerLayer @@ -246,6 +290,7 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e ), ) + class StaticLlamaModel(torch.nn.Module): """ StaticLlama is based of HF LlamaModel class. @@ -261,7 +306,7 @@ def __init__( ): super().__init__() self.model = model - self.llama_config = model.config # Store LlamaConfig + self.llama_config = model.config # Store LlamaConfig self.normalizer = torch.tensor(self.llama_config.hidden_size**0.5, dtype=dtype) self.mask = mask self.lm_head = lm_head @@ -270,7 +315,12 @@ def set_inference_params(self, inference_params): self.inference_params = inference_params # @sudhakars: is `arbitrary` fine being the default here? 
- def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): # import pdb; pdb.set_trace() if hidden_states.shape[1] > 1: torch.save(hidden_states, "input_ctxt.pth") @@ -314,8 +364,10 @@ def set_inference_params(self, inference_params): self.llama_layers.set_inference_params(inference_params) # @sudhakars: is `arbitrary` a good default value here? - def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_type: str = "arbitrary"): - logits = self.llama_layers(hidden_states, attention_mask=mask, attn_mask_type = mask_type) + def forward( + self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_type: str = "arbitrary" + ): + logits = self.llama_layers(hidden_states, attention_mask=mask, attn_mask_type=mask_type) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -336,9 +388,11 @@ def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_t # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - setup_cache_params_from_infer_params(self.inference_params, - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), - max_input_length=1) + setup_cache_params_from_infer_params( + self.inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1, + ) else: self.inference_params.setup_before_new_input(length=1) @@ -353,6 +407,7 @@ class PartialForwardWrapper(torch.nn.Module): `functools.partial` is used to wrap the module, it changes the modules' type and that interferes with the `make_graphed_callables` intrinsics. """ + def __init__(self, module, **kwargs): super().__init__() self.module = module @@ -441,7 +496,6 @@ def _padding_to_end(inputs, lengths): # For Paged Attention, make the valid sequences, multiple of 64 # inputs.data = new_input_ids[:, :4].repeat(1, 16) - def _next_64_multiply(self, x): return ((x + 63) // 64) * 64 @@ -455,17 +509,17 @@ def _create_hidden_states_buffer(self, input_ids: torch.Tensor): # This function is overriden in TeGEmmaForCausalLMCudaGraphs. 
def _create_inference_params(self, *args, **kwargs): - infer_params = InferenceParams( - *args, **kwargs - ) + infer_params = InferenceParams(*args, **kwargs) max_batch_size = kwargs["max_batch_size"] # Initialize some legacy params infer_params.cached_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) + (max_batch_size,), device="cuda", dtype=torch.int32 + ) infer_params.input_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) + (max_batch_size,), device="cuda", dtype=torch.int32 + ) return infer_params @@ -507,11 +561,10 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf else: inference_params.setup_before_new_input(length=input_ids.shape[1]) - logits = self._model_context_phase( hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary" + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", ) # We choose logits coresponding with last token in each sequence, @@ -520,7 +573,6 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf # they are the last token in the sequence when qkv_format != "thd". if self.config.qkv_format == "thd": logits = logits[ - torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, : ] else: @@ -585,15 +637,17 @@ def generate( dtype=torch.bfloat16, is_paged=True, page_size=64, - total_num_pages=64 *3, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64 * 3, # 64 * 64 (max_sequence_length) / 64 (page_size) # is_cuda_graph=False ) def init_cache_params_in_infer_params(inference_params): inference_params.cached_sequence_lengths = torch.zeros( - (batch_size,), device="cuda", dtype=torch.int32) + (batch_size,), device="cuda", dtype=torch.int32 + ) inference_params.input_sequence_lengths = torch.zeros( - (batch_size,), device="cuda", dtype=torch.int32) + (batch_size,), device="cuda", dtype=torch.int32 + ) init_cache_params_in_infer_params(inference_params) inference_params.qkv_format_legacy = self.config.qkv_format @@ -609,9 +663,11 @@ def init_cache_params_in_infer_params(inference_params): # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - setup_cache_params_from_infer_params(inference_params, - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), - max_input_length=1) + setup_cache_params_from_infer_params( + inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1, + ) else: inference_params.setup_before_new_input(length=1) @@ -630,7 +686,11 @@ def init_cache_params_in_infer_params(inference_params): # setup_cache_params_from_infer_params(inference_params, input_ids) # @sudhakars: could create position_ids from mask here - next_tokens = self._model_generation_phase(hidden_states, mask, mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + next_tokens = self._model_generation_phase( + hidden_states, + mask, + mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + ) # next_tokens is static output tensor, so we need to clone it # - it gets changed every iteration. 
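            # In other words, the graphed generation phase replays into the same
            # output storage on every launch; without .clone(), every entry of
            # output_tokens would alias that one buffer and end up holding only
            # the tokens produced by the final decoding step.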
output_tokens.append(next_tokens.clone()) @@ -643,11 +703,14 @@ def forward(self, *args, **kwargs): hidden_states = self.model.embed_tokens(kwargs["input_ids"]) logits = self._model_context_phase( hidden_states, - attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="arbitrary" + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", ) return logits + class TELlamaForCausalLMCudaGraphs(TELlamaForCausalLM): """ TELlamaForCausalLMCudaGraphs is the version of the class TELlamaForCausalLM @@ -701,10 +764,13 @@ def record(self): max_input_length=input_shape[1], ) self._model_context_phase = self.record_graph( - PartialForwardWrapper(self._model_context_phase, attn_mask_type="padding_causal" - if self.inference_params.qkv_format == "thd" - else "arbitrary"), - self.hidden_states_buffer + PartialForwardWrapper( + self._model_context_phase, + attn_mask_type=( + "padding_causal" if self.inference_params.qkv_format == "thd" else "arbitrary" + ), + ), + self.hidden_states_buffer, ) # CUDA Graphs recording input_shape = (self.config.cuda_graphs_static_batch_size, 1) @@ -714,10 +780,11 @@ def record(self): max_input_length=input_shape[1], ) self._model_generation_phase = self.record_graph( - PartialForwardWrapper(self._model_generation_phase, mask_type="padding" - if self.inference_params.qkv_format=="thd" - else "arbitrary"), - self.generation_buffer + PartialForwardWrapper( + self._model_generation_phase, + mask_type="padding" if self.inference_params.qkv_format == "thd" else "arbitrary", + ), + self.generation_buffer, ) # CUDA Graphs recording """ diff --git a/docs/examples/te_gemma/utils.py b/docs/examples/te_gemma/utils.py index 46577071c8..27e07ee15a 100755 --- a/docs/examples/te_gemma/utils.py +++ b/docs/examples/te_gemma/utils.py @@ -29,6 +29,7 @@ from te_gemma import TEGemmaForCausalLM, TEGemmaForCausalLMCudaGraphs from te_llama import TELlamaForCausalLM, TELlamaForCausalLMCudaGraphs + class HyperParameters: def __init__(self): self.mixed_precision = "bf16" @@ -133,6 +134,7 @@ def init_te_llama_model(hyperparams): model.record() return model.cuda() + def init_te_gemma_model(hyperparams): cls = TEGemmaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TEGemmaForCausalLM config = AutoConfig.from_pretrained(hyperparams.model_name) @@ -265,8 +267,8 @@ def run_forward_pass(model, hyperparams, num_iters): for _ in range(num_iters): _, batch = next(train_dataloader) batch["input_ids"] = batch["input_ids"].cuda() - batch['attention_mask'] = batch["attention_mask"].cuda() - model(input_ids = batch["input_ids"], attention_mask = batch['attention_mask']) + batch["attention_mask"] = batch["attention_mask"].cuda() + model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]) """ @@ -282,7 +284,6 @@ def print_sample_of_generated_texts(model): prompts *= 32 inputs = tokenizer(prompts, return_tensors="pt", padding=True) - max_length = inputs["input_ids"].size(1) new_length = ((max_length + 63) // 64) * 128 diff --git a/transformer_engine/pytorch/attention/inference.py b/transformer_engine/pytorch/attention/inference.py index bcd4d7de30..62a724ef79 100644 --- a/transformer_engine/pytorch/attention/inference.py +++ b/transformer_engine/pytorch/attention/inference.py @@ -220,7 +220,6 @@ def __init__( device=torch.cuda.current_device(), ) - def reset(self): """Reset InferenceParams state""" self.sequences = OrderedDict() @@ -275,7 +274,7 @@ def pre_step( 
pre_step_seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( dtype=torch.int32, device="cpu" ) - self.pre_step_seqlens[:len(pre_step_seqlens)].copy_(pre_step_seqlens, non_blocking=True) + self.pre_step_seqlens[: len(pre_step_seqlens)].copy_(pre_step_seqlens, non_blocking=True) seqlens_q = list(step_dict.values()) cu_seqlens_q = [0] + [sum(seqlens_q[:i]) for i in range(1, self.batch_size + 1)] From 97b756c6e5b05e6f4f8207fdbeef8226b3fb3113 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Tue, 24 Jun 2025 16:41:23 -0700 Subject: [PATCH 5/7] perf imp for kv cache ops Signed-off-by: Sudhakar Singh --- transformer_engine/common/fused_attn/kv_cache.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/fused_attn/kv_cache.cu b/transformer_engine/common/fused_attn/kv_cache.cu index af69faaabe..ea468e435b 100644 --- a/transformer_engine/common/fused_attn/kv_cache.cu +++ b/transformer_engine/common/fused_attn/kv_cache.cu @@ -116,14 +116,14 @@ void copy_to_kv_cache_launcher(Tensor new_k, Tensor new_v, Tensor k_cache, Tenso bool is_non_paged, cudaStream_t stream) { if (new_k.has_data() && new_v.has_data() && k_cache.has_data() && v_cache.has_data()) { if (is_non_paged) { - reindex_kv_cache_kernel<<<16, 256, 0, stream>>>( + reindex_kv_cache_kernel<<<128, 1024, 0, stream>>>( reinterpret_cast(k_cache.data.dptr), reinterpret_cast(v_cache.data.dptr), reinterpret_cast(page_table.data.dptr), reinterpret_cast(cu_new_lens.data.dptr), reinterpret_cast(cu_cached_lens.data.dptr), h_kv, d_k, d_v, b, max_seq_len); } - copy_to_kv_cache_kernel<<<16, 256, 0, stream>>>( + copy_to_kv_cache_kernel<<<128, 1024, 0, stream>>>( reinterpret_cast(new_k.data.dptr), reinterpret_cast(new_v.data.dptr), reinterpret_cast(k_cache.data.dptr), reinterpret_cast(v_cache.data.dptr), reinterpret_cast(page_table.data.dptr), From 5011eb33eec6fd0742a5e63fb656d7b6e067ad41 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Tue, 24 Jun 2025 16:43:26 -0700 Subject: [PATCH 6/7] add code for calibration Signed-off-by: Sudhakar Singh --- docs/examples/te_gemma/run_generation.py | 57 ++++++++++++++++++------ 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py index 6c45b9d670..e208598dfa 100755 --- a/docs/examples/te_gemma/run_generation.py +++ b/docs/examples/te_gemma/run_generation.py @@ -1,22 +1,51 @@ from utils import * +import transformer_engine.pytorch as te hyperparams.model_name = "/perfhome/repos/ckpt/models/gemma-7b-hf/" # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" hyperparams.qkv_format = "thd" -# hyperparams.generation_cuda_graphs = True # 709.8s -hyperparams.generation_cuda_graphs = True +run_generation = True +run_calibration = False -if hyperparams.generation_cuda_graphs: - # It is necessary to preallocate a static buffer. - # CUDA graphs require static input tensors for every kernel. - # This approach may result in a slight increase in memory consumption; - # however, the substantial speedup achieved makes it worthwhile. - hyperparams.cuda_graphs_static_batch_size = 64 - hyperparams.cuda_graphs_static_max_seq_len = 1024 - hyperparams.cuda_graphs_static_max_context_len = 128 +if run_calibration: + hyperparams.fuse_qkv_params = True # This is needed by the last improvement. 
-hyperparams.is_paged = False -model = init_te_gemma_model(hyperparams) + model = init_te_gemma_model(hyperparams) + + # Calibration + with te.fp8_autocast(enabled=False, calibrating=True), \ + torch.autocast(device_type='cuda', dtype=torch.bfloat16): + model.train() + run_forward_pass(model, hyperparams, num_iters=512) + + # Compute scale_fwd with enabled fp8 autocast + with te.fp8_autocast(enabled=True), \ + torch.autocast(device_type='cuda', dtype=torch.bfloat16): + run_forward_pass(model, hyperparams, 1) + + # Some parameters are in pointing to the same tensors, double save is avoided here. + dict_to_save = {k: v for k, v in model.state_dict().items() \ + if ("_context_phase" not in k and "_generation_phase" not in k)} + torch.save(dict_to_save, 'calibrated_weights.pth') # <== Add path to save calibrated weights. + + +if run_generation: + + # hyperparams.generation_cuda_graphs = False # 4.15s + hyperparams.generation_cuda_graphs = True # 4.38s + + if hyperparams.generation_cuda_graphs: + # It is necessary to preallocate a static buffer. + # CUDA graphs require static input tensors for every kernel. + # This approach may result in a slight increase in memory consumption; + # however, the substantial speedup achieved makes it worthwhile. + hyperparams.cuda_graphs_static_batch_size = 64 + hyperparams.cuda_graphs_static_max_seq_len = 128 + hyperparams.cuda_graphs_static_max_context_len = 128 + + hyperparams.is_paged = False + model = init_te_gemma_model(hyperparams) + + print_sample_of_generated_texts(model) + benchmark_generation(model) -print_sample_of_generated_texts(model) -# benchmark_generation(model) From 0f7ea225be20982cfc28723c9027745465390d0c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Jun 2025 23:52:23 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/examples/te_gemma/run_generation.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py index 87e5e40f01..910fa325d0 100755 --- a/docs/examples/te_gemma/run_generation.py +++ b/docs/examples/te_gemma/run_generation.py @@ -10,31 +10,34 @@ run_calibration = False if run_calibration: - hyperparams.fuse_qkv_params = True # This is needed by the last improvement. + hyperparams.fuse_qkv_params = True # This is needed by the last improvement. model = init_te_gemma_model(hyperparams) # Calibration - with te.fp8_autocast(enabled=False, calibrating=True), \ - torch.autocast(device_type='cuda', dtype=torch.bfloat16): + with te.fp8_autocast(enabled=False, calibrating=True), torch.autocast( + device_type="cuda", dtype=torch.bfloat16 + ): model.train() run_forward_pass(model, hyperparams, num_iters=512) # Compute scale_fwd with enabled fp8 autocast - with te.fp8_autocast(enabled=True), \ - torch.autocast(device_type='cuda', dtype=torch.bfloat16): + with te.fp8_autocast(enabled=True), torch.autocast(device_type="cuda", dtype=torch.bfloat16): run_forward_pass(model, hyperparams, 1) # Some parameters are in pointing to the same tensors, double save is avoided here. - dict_to_save = {k: v for k, v in model.state_dict().items() \ - if ("_context_phase" not in k and "_generation_phase" not in k)} - torch.save(dict_to_save, 'calibrated_weights.pth') # <== Add path to save calibrated weights. 
+ dict_to_save = { + k: v + for k, v in model.state_dict().items() + if ("_context_phase" not in k and "_generation_phase" not in k) + } + torch.save(dict_to_save, "calibrated_weights.pth") # <== Add path to save calibrated weights. if run_generation: # hyperparams.generation_cuda_graphs = False # 4.15s - hyperparams.generation_cuda_graphs = True # 4.38s + hyperparams.generation_cuda_graphs = True # 4.38s if hyperparams.generation_cuda_graphs: # It is necessary to preallocate a static buffer. @@ -50,4 +53,3 @@ print_sample_of_generated_texts(model) benchmark_generation(model) -
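
For reference, the calibration workflow that PATCH 6/7 adds to run_generation.py reduces to the short sketch below. It assumes the helpers defined in docs/examples/te_gemma/utils.py in PATCH 1/7 (the shared `hyperparams` object, `init_te_gemma_model`, `run_forward_pass`) behave as shown in the diffs above; the weight path, the number of calibration iterations, and the output filename are illustrative placeholders rather than fixed values.

    import torch
    import transformer_engine.pytorch as te

    from utils import hyperparams, init_te_gemma_model, run_forward_pass

    # Calibration pass: FP8 execution stays disabled, but amax/scale statistics
    # are collected so FP8 can be enabled later for the same weights.
    hyperparams.model_name = "/path/to/gemma-7b-hf"  # illustrative path
    hyperparams.fuse_qkv_params = True
    model = init_te_gemma_model(hyperparams)

    with te.fp8_autocast(enabled=False, calibrating=True), torch.autocast(
        device_type="cuda", dtype=torch.bfloat16
    ):
        model.train()
        run_forward_pass(model, hyperparams, num_iters=512)

    # One FP8-enabled pass materializes the forward scaling factors (scale_fwd).
    with te.fp8_autocast(enabled=True), torch.autocast(
        device_type="cuda", dtype=torch.bfloat16
    ):
        run_forward_pass(model, hyperparams, num_iters=1)

    # Skip the graphed context/generation phase wrappers, which point at the
    # same underlying weight tensors, to avoid saving duplicates.
    state = {
        k: v
        for k, v in model.state_dict().items()
        if "_context_phase" not in k and "_generation_phase" not in k
    }
    torch.save(state, "calibrated_weights.pth")

The saved state dict is the artifact the FP8 generation path in the accompanying tutorial notebook is expected to load.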