From 243070084643e1500d6b966ad14780f45d94b9c2 Mon Sep 17 00:00:00 2001
From: Sudhakar Singh
Date: Mon, 2 Jun 2025 14:16:43 -0700
Subject: [PATCH 1/7] add tutorial files and other local changes

Signed-off-by: Sudhakar Singh
---
 docs/examples/te_gemma/check_cuda_graphs.py   |   60 +
 docs/examples/te_gemma/check_gemm.py          |  132 ++
 docs/examples/te_gemma/check_rope.ipynb       |  716 +++++++++
 docs/examples/te_gemma/media/calibration.svg  |    1 +
 .../te_gemma/media/calibration_1_half.svg     |    1 +
 .../te_gemma/media/calibration_2_half.svg     |    1 +
 .../te_gemma/media/fp8_model_init.svg         |    1 +
 .../te_gemma/media/fp8_model_init_1_half.svg  |    1 +
 .../te_gemma/media/fp8_model_init_2_half.svg  |    1 +
 .../te_gemma/media/generation_animation.gif   |  Bin 0 -> 135280 bytes
 docs/examples/te_gemma/media/graphs.svg       |    1 +
 docs/examples/te_gemma/media/graphs_1.png     |  Bin 0 -> 16100 bytes
 docs/examples/te_gemma/media/graphs_2.png     |  Bin 0 -> 15177 bytes
 docs/examples/te_gemma/media/plot.svg         |    1 +
 docs/examples/te_gemma/media/thd_bshd.svg     |    1 +
 docs/examples/te_gemma/requirements.txt       |    4 +
 docs/examples/te_gemma/run_gemma_2b.py        |   15 +
 docs/examples/te_gemma/run_generation.py      |   22 +
 .../examples/te_gemma/run_generation_llama.py |   10 +
 docs/examples/te_gemma/te_gemma.py            |  808 +++++++++
 .../te_gemma/te_gemma_loading_weights.py      |  160 +++
 docs/examples/te_gemma/te_llama.py            |  759 ++++++++++
 .../te_gemma/te_llama_loading_weights.py      |  224 +++
 docs/examples/te_gemma/test_paged_attn.ipynb  |   33 +
 ...celerate_hf_gemma_finetuning_with_te.ipynb |  314 ++++
 .../tutorial_generation_gemma_with_te.ipynb   | 1277 +++++++++++++++++
 docs/examples/te_gemma/utils.py               |  366 +++++
 .../pytorch/attention/inference.py            |   15 +-
 .../pytorch/attention/multi_head_attention.py |    6 +-
 .../pytorch/csrc/extensions/apply_rope.cpp    |    3 +-
 30 files changed, 4927 insertions(+), 6 deletions(-)
 create mode 100644 docs/examples/te_gemma/check_cuda_graphs.py
 create mode 100755 docs/examples/te_gemma/check_gemm.py
 create mode 100755 docs/examples/te_gemma/check_rope.ipynb
 create mode 100755 docs/examples/te_gemma/media/calibration.svg
 create mode 100755 docs/examples/te_gemma/media/calibration_1_half.svg
 create mode 100755 docs/examples/te_gemma/media/calibration_2_half.svg
 create mode 100755 docs/examples/te_gemma/media/fp8_model_init.svg
 create mode 100755 docs/examples/te_gemma/media/fp8_model_init_1_half.svg
 create mode 100755 docs/examples/te_gemma/media/fp8_model_init_2_half.svg
 create mode 100755 docs/examples/te_gemma/media/generation_animation.gif
 create mode 100755 docs/examples/te_gemma/media/graphs.svg
 create mode 100755 docs/examples/te_gemma/media/graphs_1.png
 create mode 100755 docs/examples/te_gemma/media/graphs_2.png
 create mode 100755 docs/examples/te_gemma/media/plot.svg
 create mode 100755 docs/examples/te_gemma/media/thd_bshd.svg
 create mode 100755 docs/examples/te_gemma/requirements.txt
 create mode 100644 docs/examples/te_gemma/run_gemma_2b.py
 create mode 100755 docs/examples/te_gemma/run_generation.py
 create mode 100755 docs/examples/te_gemma/run_generation_llama.py
 create mode 100755 docs/examples/te_gemma/te_gemma.py
 create mode 100755 docs/examples/te_gemma/te_gemma_loading_weights.py
 create mode 100755 docs/examples/te_gemma/te_llama.py
 create mode 100755 docs/examples/te_gemma/te_llama_loading_weights.py
 create mode 100755 docs/examples/te_gemma/test_paged_attn.ipynb
 create mode 100755 docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb
 create mode 100755 docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb
 create mode 100755 docs/examples/te_gemma/utils.py

diff --git a/docs/examples/te_gemma/check_cuda_graphs.py b/docs/examples/te_gemma/check_cuda_graphs.py
new file mode 100644
index 0000000000..fa198db5ef
--- /dev/null
+++ b/docs/examples/te_gemma/check_cuda_graphs.py
@@ -0,0 +1,60 @@
+import torch
+from transformer_engine.pytorch import Linear, LayerNorm
+
+
+# 1. Define model with static buffers
+class TE_Model(torch.nn.Module):
+    def __init__(self, max_seq_len=4096):
+        super().__init__()
+        self.max_seq_len = max_seq_len
+        self.ln = LayerNorm(1024)
+        self.attn_proj = Linear(1024, 1024)
+
+        # Pre-allocate static buffers
+        self.register_buffer('kv_cache', torch.zeros(max_seq_len, 1024, device='cuda'))
+        self.register_buffer('attn_mask', torch.tril(torch.ones(max_seq_len, max_seq_len, device='cuda')))
+
+    def forward(self, hidden_states, seq_start: int):
+        # Dynamic slicing of static buffers
+        seq_len = hidden_states.size(1)
+        current_mask = self.attn_mask[seq_start:seq_start+seq_len, :seq_len]
+
+        x = self.ln(hidden_states)
+        x = self.attn_proj(x)
+        # Update KV cache (in-place)
+        self.kv_cache[seq_start:seq_start+seq_len].copy_(x)
+        return x
+
+
+# 2. Create graphable callables
+model = TE_Model().cuda()
+static_input = torch.randn(8, 256, 1024, device='cuda')  # (batch, seq, hidden)
+seq_start = torch.tensor(0, device='cuda')
+
+# Wrap with CUDA Graphs
+graph_model = torch.cuda.make_graphed_callables(
+    [model],  # Module list
+    sample_args=[(static_input, seq_start)],  # Must match actual input structure
+    # memory_pool=torch.cuda.graphs.graph_pool_handle(),
+    allow_unused_input=False
+)
+
+
+# 3. Warmup and execution
+def run_inference(x, seq_start):
+    # Inputs must match sample_args' device/type/shape
+    x = x.to('cuda', non_blocking=True).requires_grad_(False)
+    seq_start = seq_start.to('cuda', non_blocking=True)
+
+    with torch.cuda.amp.autocast():
+        return graph_model(x, seq_start)
+
+
+# Warm-up (essential for TE's kernel auto-tuner)
+for _ in range(3):
+    _ = run_inference(static_input, seq_start)
+torch.cuda.synchronize()
+
+
+# 4. Usage with dynamic sequence lengths
+def process_batch(inputs, start_pos):
+    # inputs: (batch, seq) on CPU
+    inputs_gpu = inputs.to('cuda', non_blocking=True)
+
+    # Output shares memory with pre-allocated buffers
+    return run_inference(inputs_gpu, start_pos)
diff --git a/docs/examples/te_gemma/check_gemm.py b/docs/examples/te_gemma/check_gemm.py
new file mode 100755
index 0000000000..dbcc0f53af
--- /dev/null
+++ b/docs/examples/te_gemma/check_gemm.py
@@ -0,0 +1,132 @@
+import functools
+from typing import Optional, Tuple, Union, List
+import torch
+import transformer_engine as te
+import transformer_engine_torch as tex
+from transformer_engine.pytorch.constants import TE_DType
+from transformer_engine.pytorch.utils import assert_dim_for_fp8_exec
+from transformer_engine.pytorch.module.base import get_workspace
+import transformer_engine.pytorch.cpp_extensions as cpp_tex
+
+
+@functools.lru_cache(maxsize=None)
+def _empty_tensor() -> torch.Tensor:
+    """Get tensor with no entries and no data"""
+    return torch.Tensor()
+
+
+def gemm(
+    A: torch.Tensor,
+    B: torch.Tensor,
+    dtype: torch.dtype,
+    workspace: torch.Tensor,
+    gelu: bool = False,
+    gelu_input: Optional[torch.Tensor] = None,
+    grad: bool = False,
+    accumulate: bool = False,
+    layout: str = "TN",
+    out: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    use_bias: bool = False,
+    ub_algo: tex.CommOverlapAlgo = None,
+    ub: Union[tex.CommOverlap, tex.CommOverlapP2P] = None,
+    extra_output_tensor: torch.Tensor = None,
+) -> Tuple[Union[torch.Tensor, None], ...]:
+    """Non FP8 GEMM."""
+
+    assert layout in ("TN", "NN", "NT"), f"GEMM layout {layout} not supported."
+    transa = layout[0] == "T"
+    transb = layout[1] == "T"
+    empty_tensor = _empty_tensor()
+    fp8_index = -1  # dummy index
+
+    if out is None:
+        out = torch.empty(
+            B.shape[1] if transb else B.shape[0],
+            A.shape[0] if transa else A.shape[1],
+            dtype=dtype,
+            device="cuda",
+        )
+    else:
+        if not out.is_contiguous():
+            raise ValueError("Output tensor is not contiguous.")
+
+    if gelu and not grad:
+        gelu_input = torch.empty_like(out, dtype=dtype)
+    elif not gelu:
+        gelu_input = empty_tensor
+
+    if grad and use_bias:
+        grad_bias = torch.empty(B.shape[1], dtype=out.dtype, device="cuda")
+    else:
+        grad_bias = empty_tensor
+
+    bias = bias if use_bias else empty_tensor
+
+    assert (
+        A.dtype == dtype and B.dtype == dtype
+    ), f"Expected dtype={dtype}, but found A.dtype={A.dtype} and B.dtype={B.dtype}"
+    input_dtype = TE_DType[dtype]
+    output_dtype = TE_DType[out.dtype]
+    if use_bias:
+        bias_dtype = TE_DType[grad_bias.dtype] if grad else TE_DType[bias.dtype]
+    else:
+        bias_dtype = output_dtype
+
+    args = (
+        A,
+        empty_tensor,
+        fp8_index,
+        input_dtype,
+        transa,
+        B,
+        empty_tensor,
+        fp8_index,
+        input_dtype,
+        transb,
+        out,
+        empty_tensor,  # out_scale
+        output_dtype,
+        empty_tensor,  # out_amax
+        grad_bias if grad else bias,
+        bias_dtype,
+        gelu_input,
+        grad,
+        workspace,
+        workspace.shape[0],
+        accumulate,
+        False,  # use_split_accumulator
+    )
+    fn = torch.ops.tex_ts.te_gemm_ts
+    if ub_algo is not None:
+        assert ub is not None, "ub object is None!"
+    _ = fn(*args)
+
+    import pdb; pdb.set_trace()
+    return out, grad_bias, gelu_input
+
+
+if __name__ == "__main__":
+    fc2_weight = torch.load("fc2_weight.pth").cuda()
+
+    base_repo = "/perfhome/mnt/wkstn/work/repos/te_gemma_gen_support/TransformerEngine/docs/examples/te_gemma/"
+    base_repo = ""
+    gelu_out = torch.load(base_repo + "gelu_out.pth").cuda()
+
+    activation_dtype = torch.bfloat16
+    fc2_bias = _empty_tensor()
+    use_fc2_bias = False
+
+    dim_size = list(gelu_out.size())
+    dim_size[1] = fc2_weight.size(0)
+    fc2_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device)
+
+    _ = cpp_tex.gemm(
+        fc2_weight,
+        gelu_out,
+        activation_dtype,
+        get_workspace(),
+        bias=fc2_bias,
+        use_bias=use_fc2_bias,
+        out=fc2_out,
+        ub_algo=None,
+        ub=None,
+        extra_output_tensor=None,
+    )
\ No newline at end of file
diff --git a/docs/examples/te_gemma/check_rope.ipynb b/docs/examples/te_gemma/check_rope.ipynb
new file mode 100755
index 0000000000..26d5c9058f
--- /dev/null
+++ b/docs/examples/te_gemma/check_rope.ipynb
@@ -0,0 +1,716 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "72f61b51-b6fc-4463-9783-d42a25ca3a2f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "before tex import\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import torch.nn.functional as F\n",
+    "import math\n",
+    "print(\"before tex import\")\n",
+    "import transformer_engine as te\n",
+    "import transformer_engine_torch as tex"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1f81be75-bf64-43b2-852a-7c482a1c3418",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformer_engine.pytorch.attention import apply_rotary_pos_emb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8853f973-d834-41a9-929d-8687b947134f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compare_rope_outputs(t, freqs_s11d, freqs_sb1d):\n",
+    "    output1 = tex.fused_rope_forward(t, freqs_s11d, torch.Tensor(), False)\n",
+    "    output2 = tex.fused_rope_forward(t, freqs_sb1d, torch.Tensor(), False)\n",
+    "    print(output1, output2, sep=\"\\n\")\n",
+    "    assert torch.allclose(output1, output2)\n",
+    "    return output1, output2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6b7bada1-6748-46f1-93a4-c2ac1a617063",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "torch.manual_seed(0)\n",
+    "b = 2\n",
+    "s = 3\n",
+    "h = 2\n",
+    "d = 4"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "54a8f6d6-28f8-4a9a-8ba0-0fdefff138e7",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([3, 1, 1, 4]) torch.Size([3, 2, 1, 4])\n"
+     ]
+    }
+   ],
+   "source": [
+    "freqs_s11d = torch.ones(s, 1, 1, d).cuda() * math.pi/4\n",
+    "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n",
+    "t = torch.ones(s, b, h, d).cuda()\n",
+    "\n",
+    "print(freqs_s11d.shape, freqs_sb1d.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "5070307a-3104-401b-b84c-00f3bbf02ccc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[[[0.7854, 0.7854, 0.7854, 0.7854]]],\n",
+       "\n",
+       "\n",
+       "        [[[0.7854, 0.7854, 0.7854, 0.7854]]],\n",
+       "\n",
+       "\n",
+       "        [[[0.7854, 0.7854, 0.7854, 0.7854]]]], device='cuda:0')"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "freqs_s11d"
+   ]
+  },
"81e52785-e6ad-4180-9567-564af692375c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 4, 4, 1)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freqs_s11d.stride()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0da9bc09-7e1e-4056-85eb-64b6122c7440", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4, 0\n", + "4, 4, 4, 1, \n", + "nvt_fused_rope_fwd: 4, 0fused_rope_fwd: 4, 0fused_rope_fwd_launcher: 4, 0thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 
0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 
0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, 
freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n" + ] + } + ], + "source": [ + "output = tex.fused_rope_forward(t, freqs_s11d, torch.Tensor(), False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1b78017d-09b3-4b5f-93a8-75f6ba6f131c", + "metadata": {}, + "outputs": [], + "source": [ + "output_unfused=apply_rotary_pos_emb(\n", + " t,\n", + " freqs_s11d,\n", + " tensor_format=\"sbhd\",\n", + " fused=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6f5d9350-deb1-48ef-a0a2-e18e01ed336f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_unfused" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b01e29b8-dfdf-41ac-81a5-d8edf6a8c168", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4, 0\n", + "4, 4, 4, 
1, \n", + "nvt_fused_rope_fwd: 4, 0fused_rope_fwd: 4, 0fused_rope_fwd_launcher: 4, 08, 4\n", + "8, 4, 4, 1, \n", + "nvt_fused_rope_fwd: 8, 4fused_rope_fwd: 8, 4fused_rope_fwd_launcher: 8, 4thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, 
s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 
0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, 
freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + 
"thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 
0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 2, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, 
freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')\n", + "tensor([[[[-5.9605e-08, 
-5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')\n" + ] + } + ], + "source": [ + "output1, output2 = compare_rope_outputs(t, freqs_s11d, freqs_sb1d)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b168b178-1f63-4ccc-b084-2ac2c1ec016b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([6, 1, 1, 4]) torch.Size([6, 2, 1, 4])\n" + ] + } + ], + "source": [ + "freqs_s11d = torch.randn(s, 1, 1, d).cuda()\n", + "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n", + "\n", + "print(freqs_s11d.shape, freqs_sb1d.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "33ec2e07-6e54-49f7-92f7-2f217a766456", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.0000e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.0000e+00]]]],\n", + " device='cuda:0')\n", + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [ 7.0711e-01, 
7.0711e-01, 7.0711e-01, 7.0711e-01]],\n", + "\n", + " [[ 7.0711e-01, 7.0711e-01, 7.0711e-01, 7.0711e-01],\n", + " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]]]],\n", + " device='cuda:0')\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m output1, output2 \u001b[38;5;241m=\u001b[39m \u001b[43mcompare_rope_outputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreqs_s11d\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreqs_sb1d\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[8], line 5\u001b[0m, in \u001b[0;36mcompare_rope_outputs\u001b[0;34m(t, freqs_s11d, freqs_sb1d)\u001b[0m\n\u001b[1;32m 3\u001b[0m output2 \u001b[38;5;241m=\u001b[39m tex\u001b[38;5;241m.\u001b[39mfused_rope_forward(t, freqs_sb1d, torch\u001b[38;5;241m.\u001b[39mTensor(), \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(output1, output2, sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mallclose(output1, output2)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output1, output2\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "output1, output2 = compare_rope_outputs(t, freqs_s11d, freqs_sb1d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b58b818-7b31-4ecd-80bd-b5ba049b3c2e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before tex import\n" + ] + } + ], + "source": [ + "freqs_s11d = torch.randn(s, 1, 1, d).cuda()\n", + "print(freqs_s11d)\n", + "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n", + "print(freqs_sb1d)\n", + "assert torch.all(torch.eq(freqs_sb1d[:, 0, ...], freqs_sb1d[:, 1, ...]))\n", + "\n", + "comp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c04940b8-3056-466b-90f6-07a02ac47ace", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/media/calibration.svg b/docs/examples/te_gemma/media/calibration.svg new file mode 100755 index 0000000000..b1e1b5ae4b --- /dev/null +++ b/docs/examples/te_gemma/media/calibration.svg @@ -0,0 +1 @@ +FP8 with initial scaling factorsHighprecisionweightInitialFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMMWeight calibrationHighprecisionweightFP8 scalingfactorsHighprecisioninputHighprecisionGEMMFP8 with calibrated scaling factorsHighprecisionweightCalibratedFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMM \ No newline at end of file diff --git a/docs/examples/te_gemma/media/calibration_1_half.svg 
b/docs/examples/te_gemma/media/calibration_1_half.svg
new file mode 100755
index 0000000000..af2641387f
--- /dev/null
+++ b/docs/examples/te_gemma/media/calibration_1_half.svg
@@ -0,0 +1 @@
+[SVG diagram text: High precision weight, Initial FP8 scaling factors, FP8 Weight, FP8 Input, High precision input, FP8 GEMM, High precision weight, FP8 scaling factors, High precision input, High precision GEMM, FP8 with initial scaling factors, Weight calibration]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/calibration_2_half.svg b/docs/examples/te_gemma/media/calibration_2_half.svg
new file mode 100755
index 0000000000..2d56f7d434
--- /dev/null
+++ b/docs/examples/te_gemma/media/calibration_2_half.svg
@@ -0,0 +1 @@
+[SVG diagram text: Weight calibration, High precision weight, FP8 scaling factors, High precision input, High precision GEMM, FP8 with calibrated scaling factors, High precision weight, Calibrated FP8 scaling factors, FP8 Weight, FP8 Input, High precision input, FP8 GEMM]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/fp8_model_init.svg b/docs/examples/te_gemma/media/fp8_model_init.svg
new file mode 100755
index 0000000000..c7fce2120d
--- /dev/null
+++ b/docs/examples/te_gemma/media/fp8_model_init.svg
@@ -0,0 +1 @@
+[SVG diagram text: FP32/BF16, FP8, FP8 with fp8_model_init(), FP8 weight, FP8 GEMM, High precision weight, High precision input, High precision GEMM, High precision weight, FP8 Weight, FP8 input, FP8 GEMM, FP8 input]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/fp8_model_init_1_half.svg b/docs/examples/te_gemma/media/fp8_model_init_1_half.svg
new file mode 100755
index 0000000000..3b217a3eb2
--- /dev/null
+++ b/docs/examples/te_gemma/media/fp8_model_init_1_half.svg
@@ -0,0 +1 @@
+[SVG diagram text: FP32/BF16, High precision weight, High precision input, High precision GEMM, High precision weight, FP8 Weight, FP8 input, FP8 GEMM, FP8]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/fp8_model_init_2_half.svg b/docs/examples/te_gemma/media/fp8_model_init_2_half.svg
new file mode 100755
index 0000000000..46587664fe
--- /dev/null
+++ b/docs/examples/te_gemma/media/fp8_model_init_2_half.svg
@@ -0,0 +1 @@
+[SVG diagram text: FP8, FP8 with fp8_model_init(), FP8 weight, FP8 GEMM, High precision weight, FP8 Weight, FP8 input, FP8 GEMM, FP8 input]
\ No newline at end of file
diff --git a/docs/examples/te_gemma/media/generation_animation.gif b/docs/examples/te_gemma/media/generation_animation.gif
new file mode 100755
index 0000000000000000000000000000000000000000..25150cb9b64162084b017442a3905c57127c6713
GIT binary patch
literal 135280
[binary GIF data omitted]

zyl9XJE(>v`FqF0-A7q@JQY~|>kVM%Sn2t}kR~S_VGv3pqrVs0u7r}~UY~5zynnUB* zy470Y+gW2d&Q9BB*VU!V+Pfotmbh`<&kKUmo#2T|t1|A26XiRu&C?24jz`Ub^bXJo z`nfr85#Vzo72E4(vV?=cKWzi ze38s4)|cV)4FIz%88|0oGJN4A)jS#0UUY`H1KY*$lVK|ZG8g0$!tzNvBVz8W&wFeHpinsn2Ex z3`pH-G8w;GX`aoZ7u_1#8FxsW;Ait22JX#EjJwe3vqjRAMfdhA#yx-P^JTChq>Gt^ z>07ww`6t>XNS`dzejJ?S{4=K^bjXnD05N^OD!xPt9rI=So=ttRu4L#jk;(Lfs95u2 zQ+vr{x}E9p4V>g++se>$Zi(rrar$Bhx3+48l?e;lV(JB;z0wQjF{B7}PKf`f&_q5>m`t>X_^> zOzv?^eo`zU5_K$57?$KXmMkgu19fZ#Vi>l{IQA1#oagE|nlKy^-Eka4Qe0DYTniYk z%{VTJ11X-1I-WZW&vP8lmlU5UP#r%Mh95bOA4>{O0IP%HFtF)uAZh#SM8xfXNEv{U zG3!1Cs2MZ%!@F}SSL>PQ%Q+NxBxc&G0P_N{ zHUNDCusHzv0WbBVf^in;Ks}SKPe;d z|4l^vzln&yr(^yPqzucyN*QaE9epgnlw(5YzYSo4e@Yn*+xtU2+)ww1??$#f91$Dy zav7CEmUaK;WKi?o9aMadaqwQXLA3*LG8Vp1s_@_Y@qt%2`p1+C#qw|0IYw9Pb|Mtq z^jW`L=eW&1c*rY?p=vv_KM%}53NJp}dMv!8p1^y&Y@BHaS>R&M`sFqUvf_j#(><4{ zS>g6Mp3wMY)sYb6cp|}tPh!=}H@ah^sIcO6vzU4B$H%H$Z}wLJALDp1bIks1Cn_Q4 zh_nUI*KgMai~oG5d+6fFw`O1^>fpipsMqeh(zND5dDV8YooT`>>96xxd>2Pip@vP< zpCxyszK_LLN|UXQwK^XTYw~xjVGkQVyf|E`Z2#65nE6AdLFPtYrQfz>+F1d6$5}Xu z=h+G~_!GMkP|SIHYeuhp~PJTUm#sDIGPuq=6LM^mvbGC|gMEVLqKUSyw!k zaun8kUN}ZK4ism!I`h)U1Me%|n7qX6puznX0=Z>E6b*TZ*R~fR^7do)9m~h`#JLN4eWLyv*3CCX7H9qX^UW)W0w(^xXAnW8wDPF@Pn4dyT}KSmu1SwI_8QkarElMT_31s{m`6YY;zM4(U?EzC&&LJP_jzqz zOT1MT_qEB+YrT61#bfTucQXiC*J%4JM}yQRGXsZLAMqwX?f)p`yN`WKTHIsYJC!1j z@x!_bZ}>CSlGphZHk?0XlqX)_e35n|pY55;=Cd5!io$Yi4t0LE$sRk3*gIx<51a;H zl!5UX0e*E$;+4XFO5@QAnZS6$=7W>v*DUX$VBJNFl=exkx<_+7lt2r z**v|m`LxjFC4#m5w!w!+&62|(p1va`3~S1o>2%5E`ad?!sJW#gMcj<77#CGAwlrh{ z$g@R*xwW4cYZWEGtjzS*vb;5?QJyMNlhewhm1M1%5%VMS4bi6QlBmu=NPM*uudOv6 za%Rz41d>6t^VVEQW@X>L`iNGIUPwtqD~_<0C#}-sbtP9QX3J zNBk=}+y(*VJQ&9#l|Dy9OBaiGnx}<#q1ZzIpwmU!HFtGr7fY>_&yP1eHLd)9@z_74cq;7(@~ldW8@Ewa>hZ zc^T&QiVcYvke4}#|5mt)lh-(6&m@?J_F5I~LXTQd31QL5`AHu^$NY$Ev+zbNWCF#9 zP1Ih0OUzoAe-QmH#{N6`P2L+D4|Gon5;gAT2}QRnxin3{iR+3K9&bKFo=z62)s<+D zWNvCSpMI#4xj$yk2tvF|wUBHU9(69aN#!-}?!a{``xgAEfHQkWb%L(?OHYUA^zI9H?f%a`K-*~PuWU1W>#7wfUm5`)0$ zy3O5%4HxKCab9ZyLUd{E938Ox4-guv9QR8Lc-S-gimMqE9ROHG&*b?45njG znE2Wy50+k0xAcalx^(T%H&%>ruP_rIcF8VopYcd8zC`}WcUW+ZT=E*1^gU@GlHPmG z>}Ag#JM&KV zj651@hd%UQ@>n~Qd7HxT^P~I;-!}J)vuwE3XE3ujdMsmY%vWy?HbBCAAp80vXsvC( zX4mJKZPD#O!FSZe*M!>lq|j$c0jg!;>Hf$&J;?*pk{7#32TTKfW8tT00X<{aD5Uly zL-AY})1oZWKJC-l6!lsU@{L~cXHMa{v2O5QGhlqu2lv?L~e6 zHE)i7|D8TZwZgy;oq>K0-uV4~(qcjEtUe@=phQJ6UgcvCRfzvF+`o?6t2;ChXu-U5 z4wg&t(Gd#}DGF9kaW~=!nZF;he&qRhJ@{NR@N&&>JSot=pU06V&}GKEb1mp<%>_8M z8GIjlp$Rd=_p#;(gY>%vTZZ*J8!{&wKhq3P#y3r|G^VK!zYxZ}L1mlM9}s`c-NR;D z(P!IBW#*M9`ylbp7|hJ8P@`scbMppp2Ls1XO3aNYVM(7>0^%JD-YUC ziP@To37)ceT@$srZ-o(xeW>NA0ER1S#=Qg}+3YA1;_ z3NRr1Vvhwl21(l5NlzgTMresVRAR1)vyOPIWtch!+}R>L*>)od3U@YDO3pTMe%9yA zz2?MG2>-|qk5hyf2ZXs5g*j;%5h{j0)C!cxPieZDBBPk%iiZdk_kNZVYV6aWQuZmu z>Lf6@*pGcZED0PA4@Z>bAXaY&Y91lNXoKw(Q|nDq{l9oy_6NVWFbY(ZblFHv#YE=+ zfU2duW($MF7C~~0kp-Q|`Y*^*Z9inOTNl{R*DB2w;q|sStrhLrUhMzD%HvVL(L2uc z%z!iuHY6Gy>~|czFyPt)0cYgg%y^-YF{I>?g-+c*;o1>*--Bkj9;YNKW&S{O9ihFZ z8vTzNLY}%H_m9%A8j+LYfzPH9R4JL*gK4-Po|lbT`RJ@-L1>0!DT2AqVtF?S0u!u|V@hF4C3(r6dC9@4V(uwogQ@%_SsGlNTC*>7=z{!@^S=Ga ze@R!6AXq>qk;T)LIfBl#9xUi9&eo-Kk4(vDrYl^7hrKRIQM4}1|5U)VnWwLu&Bs-= z*`!pIXHs-uCzqZpDAFS@db8+jc%eC0p^Hwzf@>~|L{5}WPD*OAjD(|5Xz*QvLXp`p z%ZRK^tb+U!jznPa#5EiUzNDs<(rdW3%$717#dqN5)^Kq&DMsF>jq24h?beDK6NzeB zFB|iSQh_HbYnNp>myORpY85k^oGl|;EnlK5|HxIL_Pl&bM{H)6bL=<@Vs4HuGT+s) za4s-^Enj(mHR@n9+Ur*34-bnguIS7a3v3ljTuIA)epE$GC+ul>?6t_`v*Rkl0T_WN z4+-HLvY{#*1nm0I8%n^*ps&6uS$$i#8W1ubaUiR?nyY!{s`<{U%c|p9O>G2igzoCr zu;j)IpV@St#tR@h?jaI{pb7Kxwdg6~4UckJ8;*Olk&iZN6~J|)A&E-drI9LUY^%LV zHZIB8xpiTW$si(yZo1tNJCF+Bj&*)OS5vV}S;*Af5<;umC68Wfs 
zhU7@Llp#c%QeeDIo{MBrHX%nYH-bj7>1jk$hizSx@B5~L=HM#NCQ+@z+Aa3_t>UQu z#&D0U3}jZ=Tyst{>TP8Dn}L!9kAg1ll4RwQ{vr1E(#Em5oOG4OacIj2UAE~nl#fnU zQfd?Yv=vI;7N1&JLDx#u-})k=%_FC6bf~S9u+d7Q#LkMlUHg9f@K)Ost+tw>cDGOM zY07zi5{=Z#9nK~lzFeNQh>mJPgKK&ngElE1ba@EwHY8!|F<~h;eiLOGE3ubv4Wikl zq@zW$lgR6B1{&o@&_Xlc#;n)!X0w&2g_U(UjeWa`Q|j$wCwLdHR|y{cf3WvfQEe~k z-fjYI2@u?ZYjG%2+-WIR3KS`>Z7FWWEx3DecXxMpcehg9T?=12JJ(!u?zP7_=X_^h z?#tZdA{iqYnT}scjeCr{;U>4kBu3C}^R$?^ML>qTueTXGlE0 ztK$ypQcmv*4{Q+BNIPn1)k|!A{Z`{mVH@pYM@?Y&4PVn;L-$+JZUNdJ4Ydw!lScdf z8fLHy+HUP$L!OmX4_RYIx_csZ)olS;c1~ELb`p1A6u2*KwnEIhCpd^|p0{{0zi=}^ zaVb9_m8gG%w`fbsS!tjz$%HBe=6X@;&mJuN5#L`4)=w|&uVz#(a_%Z=q>_B|xkF>{ z<0Vgb;$WxqU~l7K|NWrz^x$aWptX}}b8Vkp+}-@Gn=YF#j4UYEbLE-K8- zT$eFO(ilP07!l^u7|G8uGU9OxfpIF$acY-wTFRtx`lfNlrEz9rLaV#+v-Su!P1E@| z#3*tTA6P$)kplo>6H2_AIKzy@=l}o~`lPaD=m0qoFgi&Fos=vJ?PCSXpaAdyQyXsL4-jl0Bn}fav3;covAMZ3rcHq zxoJR>uo=9qxHV4Y02Kiz^*}@<(?l3HcLA*3xG@mEGH0`nkxFMKtT3R%XDm+U!Mbv;xw6DWaGkVr%e0&_w({_EWh)84 z+huV_29AJb)d*uH7-Kdkcoo3%gwz7)IkftPb;;jy0dPE7EU<=THUlVK)q*6ip@u-* zVV@xLf-B0*^W{|P>@uE+*)&$`52bfMSeNmLmZwPgmp7~|HyqODlw2l1YT(ts0vDzo?9j5JX-VYex9}b}% zB{}e6JD7Y&JMDVl5pgh&O1nt5eBgSzyZW$yDR5Yzb+~!ByPbSk*?hR~x_9_+SWj|9 zdb+%Ku65MvdUP$gf7^U?!g%;VatI(jc0N6Lf=2U<+U?jZ;uvX!8oY9Bbb5>)N{uOW z^7;Jcc)6s4z(E2qqlr_8dET_LCPc$`b-oeNExoojqQ7eKt2VZQiT zz{Z7Ub`)}q-KCt~WwqbsPBBb1aQS`nav$|disEYg&6T9) z$ho%1)mZkG{>YUA+U2Vkm*_%EteU4@Dd#>yC&|RsD&McPo?Sl(&R9iWz?EL`#hr+y zPCZRNjU&BG7`aI*2~E+y417EfmXN)*+lvYyz4aElEi`|UYI|#(a;wyGTOs=-{PLo7 z)j7lQI^L&B}4ApckT z;S%qe<7Bq52%`S)h>U*%8R``iXaA!_#@~PpoBt#-(lP;m5*Zy{SaiyN5E=4t+_vY} z7#}4<34Re7zkm$A*VSKdEI<7QGV)3?KuLTsAY-m#UFLW1;iYO1+#f*3q@i4Y0mVSD2npaPy&zLWn$myr|LMNm_L9F-ZU|{=MqH(wkC5GhC;eb z^OmMdFd&24`OeC0XgXzMIEuI^R zi{nLW?~035(;q~JZq=1hva9RQyBn_Z7uQV>e*_=yu0t8_!7w7@8Sx}sTQ{6DjL2YW zl=BCryhraq6}Ug|^v%MP>HEk-47(L_^f1E!v;MfQ*4CL5tHKRPNSvi5aWGh(WfYM$i+a z+9#==J31g2ve=+-Q;vEdgn3NME{smND<5*=63E&rx0#@eOWr);wnHKX-A z_wcLsV1sk&Px(zuZ|-w=E{>Y0c~_OVe$nsRgYqnA#!uWumh3ELakC*9j`4NE<>jL7 z(vsfdE4xbQf75!H)kcAAjLRc0YsTs_e3z-z)f3*ESKw;ZrX} z`e>8YAJL)o_?xkxlH)exNsGrfROo8~YiQWR_}d94f0Q1I@wZY{4afVC!;}bi<3f_- zVK7ECmys&S!ph@A#RG3vK^IAaNqAfIh9@$w&eeW(mdorx?M5;oM(|EEjKw%BRX5k8 zxFv)=l)NQLj+E{7nV)oMqY{ttu)0U*cdCQd~I})2CSmZ(aUi12Uuv^&mdX#o^%a1rW zlG}^K(B#|QiUh{x+x;pb*h10t*j#i!Sy*^A0M0txYH9rm#D_n9)~6;~1gF zmFv={v*Is}imlxj{lU~fRU~NAN5lNV6cjjfy+mx|;o*wpRL@oW(BF?o#7&Y>bIkRD ztZ2p~(_WL&imCPkBF4Yvb&}F6&GlcEjYpNeCT0AjI&d^L9$i;S!c1*5H?Vaw9@F-c zgvDERa2aDFwy%44e36ma0s@Jk3P>92F5~)tpSDTpT; zMB)yZfOl$cII?Ua>Et7XpJ_vN#Nq(a=jI%X{9tax^3d}A12*k8fMC>}q|)+@A}vOg zx16sab~;AX_G>xYT9IQjBXyNY>g zN(EXt1*%iK%6N72N+qs1g}OL!E2bCI<&1@&^&U*Yh1%QvhP#Z zI>SoFYuZsda-3Vxx@yL2K8ZSV-oaGYzFWp?xtTh0y;;zHe86i3V&NV`5jAw+Ng-{B z%Eun~un|}Y6Vi?mee6Z2p^K~y>A;^p_Tj-?)J1cHbduwq_(^K$VWmL2=rEN}0#q0E z@LC|lcc zh5_FS{)of$S@O-IfzTuVs5=_&c`Bmj7g#iU%wPFD9Uo)qi?|TMczE=ACY`3Cv^K#6 zdffDRHqVlwtQ)~(8tz4|B$}p?LJGlDp7KS$>XOli7BqtCvgnIKLrr7#6@r<%X^e|v zhb3d}M}pZl+{;pbO%r`IQo^}D<;(K8B@+W!>vs$-`m!=l)6`g-aA9uxvbqj!$<*A9 zaB&Uysc7d1Utk96-^q-xIjH|6 zGr+2UBQsF{Kr{X#Gf@6SGk%j9D1V_DzsL;KzoQu>C*Z%M8Q_?|p&6*Z*o=Rp8Q|Y+ z#&0wO6I{!5M$D87P0k8Nb;K@W0@UUu*{W z-*Cpw3IYty(5~|M7a&3YPe1}jN5Ciu7##uAkiw7#7}x+~9$@eT410j_4>0lphCTdN zQ^05k7#9JfBVY&w41|DD6L9FbFo^{mCO*t#0rOJ81QjsvD9k|$laazK7cfo5ZwKjr zp&I@NU&5>vVgGBb_+LQce*uaAAAkhTKLH85{gYpS1lE7Fby#|^|7%<4S6T<_pKYB# z(mGiG#bNqOT!Hxyhv^@X1m-^+rhh0*G5+o_{X=1j`45Nb9|}{9KifM0PU~Ra{w5{< zmexU^F}pjc{aacG{l)!p=ik#h7{A&&|48eg|LHLOOJR!k+hO{b!W4bx4~OZ$6{Z-e zfL{*NzbQ=7|4UkD5aXX|9avch{hw(amLQCOq;-CkbI%q?! 
zkaS3;z_&j4)_$D*?O9_^>?uv2tjT;u%F;cJa82ikyH#H5v!Zc;-{`P6wPM;TOjAKh z@Z8+UI^j9?r}vx&{$oMR{_!Q}+2U!H(z+pvQvI*DB_qF$N4=AbV;0MkB-E!@2p|8J zS>L!L&6lqboL`y4h(jwE^%|N+fO?TDA(Q_;rhhV<_aaXYH8@`}M`|ix?IK@gC|?<+ zrhlr~@uENnr9h=e3U#_X{-W@UOo7@=|8#Xd%0-byaDm3w@2(EYA7Pz8yE-VSg*ph* zv+V?zrG7GndU$jUz2A7JV}oo)^l6MIdb54YW0P_}y-1krGqV*)y4W}1A)Jp6!4=4| zq1iW-Pf&|3p5`yIIWX?SQtus(uCBV+Fttor=s%q1YZ}@y@4(Vnp+L3c>)P0{EQwr9 zVUFP)!r8UX!O|Rh$Hh0*#B5hypm|_HT+=Xn!)#wfxqKqZRloU(*>OTW>9ib=_t2%p z`a3a$_U33!bAR0n*YJ#$p9DEA56@Ad@QjH-EC}~ARKZdYyusCHeAjL0uuup#&D%93 z_1Z6p&Bw-Ug=^CmwH*aF#4LUal;1H@U=mYi6qv*`9;br)C9~^^fga=f@F;O@cS0R& z2z$mlv37QkU@!wr7v4LR>edxXf|1EJv?ETa147h+8nQNhp|ODh6=sK- z-q0^Xf%p`8AytLa#sI)9FRSd>tudc~nv&`)Rsm~@aQL8y^KV;cP`T~k*q0cd zT@wKImC2eMF3i+lLEZ1Br{E8nfY6v*4bCs#hoFFKpB+nhp#fqnp;5ugcU54nehi%Yn$nO(le$)-Q0% zfMI|(4m819$D^tu9LZBnpew!gPQEaw95x0KQrTuK<$N^&yFAm?L6RO<2~g^p)B0c7kk$$%_@C9 zEkBn~nQqmuJWuiHZt!fVu%Th!-JXAhhNRn2XF`L^?cHafS9pwIPFnZ-9>xqjcidRJ zAvl|xYImRUSJ5fe^+Tb)33(aCytAFmzv8 zzj*EKEb0mvbQ&x zDS@-AwzYMl^(+4F3x)WZ)iQX206zA9!MO|p60e+9Xy_pRTOSCZHGT{U+xT?X?kOq+ z=Ue_6wfJB%hnMyNS*(u1Jpq}lo&^YjX{?Ss5`oF#jun)Cfyv~~HNAley56m`_yb%) zDJRZ?UxMP~gZg{%duM|Z+k$9DWAR~0owziw=`Z+3AfLG}_z0~oOZNDyF)nKm{Erad z4G8|ZfiKlg?MF4gkb`Xq2of5`3O$wx4P*_y-u{?xAA08pxxEezNb{wF%Y&eQ33z&g z_arXNH{2c67v{X?jvfc0RPn%)grwPr`>uJA5*R|tXT!bSJ;@{?0^AXv?!nYMcNn+@ zZ#HFN@9^=WAW@Qrwu+K?3Jy^gCw}p{Q6}M0R9bVmagb<>G5?P{xcCmyrt$$seYkns z#tpO425SLmbTJwy;iR^4xYe;ShG~J${j?G2}x#SmqA) zV|~~}s!7uBF|%>l6m?0G@)67a*m3yD!XmNcKW?zr9g+oflXv^DcIJ|K+mestuy}b= zMAssd3{!Z>zmeY}V!xS75ls6An8#Y^4S{#W>UKzd35-TE!qS*aM;!Gv$5K;Ednsid2`UiQ*DtjI^uG0 z-E*Ah(fcTGa;RCBQ`)6@NO(O5lR_UB@xC57#xFY)A|0+UIj=h36$^N`0f z5(&`X-{t|<^3r$F($w;Q3g_q6qg{~IuA^YN$Uc z2}-|PpinxMPqIKY^U7ySbEudbQ0fRO=EK|{#Ve+c3%K{dI1Uwj3@A&t6|<|kqJb#v zwZ-IOged3nmE$a)NCTCFrFnA5C?~vC9Xh4Td|;)7s#a*#OATaj-c_ZxI=G0py0aBX z5C)vXtNyGGCbBSo#Zq9(2fljtwVJo4U9N_lrNSN=j1RSHRG6=6Kdy<)sqj!oCRjB_ zTJ_zv17HA>d>4@X)m`&x%@B@l*6`};S?Z{2Z86ZT!gA{poXo`x0QtT4dJOf&$JLnw z_L&I)rh@vi*7_nRb96u*Z+Re6rFsJ*Aft{CxxvXCaTHK{*HEwCfF4%geTUSB+|bp~ zn2QdmHclNiMjjuqFH>)-Tm=HcfFq~{P5G_B)qe?0!7zjAUjkF4WsFopR`k^dzyh51Xte+o?h@RuU}5}5w>mm>cmF#YW>MfxQ${p~MBh6zmn zN z;ZavtH$FZd78dsE)vK>xzjk$Xv9hwRuC6L5C?qE*fBg87oSeL{un>z?$h-s|cFSX1 z2S8?KfnD^#s?P@5&tTzaOirf%Y(M`Gu2}!GZ-tNq1$cORdHeYK`3D3B1&4%&g-1kw zgBfOH5KC7V z&Q<)s>PBNRFl!CgmaX@MQA#Jjtt;Oc_#glbZM5!JzQ7HH&rc*QFuTuAzh%zcp)QR&+|NIy z)e*gM=9mu-9az<#_cJz5|F(-y{6&`N@n9)$Ld$Y+OKXUxiXd(}wvh+td!Y zZ7Rhs`!sB?ez!)};U4!T7fny+r@h5X^M4e6=8qQQ=L}iqN2`z3XO^tr21p9LE7#|L zjbykHb^lrwUQIV-? 
zps0gGqnhOWp!h{F`p091e3A|A;H4+AJ3L0rX1D@aPI;Op& zFC-rekviY~(4d?K5;_B3Fo>H)_bVl`4WSMuL7R z_n?Z+sorXKGHq8vGFOrGzZ3eU;IX{g|P*@WpVVUuV{G;&P?O$Kzvb1e4yBSUAcE z;5aW@JK%{7GMsxL@9`eP$jDSbycY-N`-;(irGAbg$4$PM)zf)R`vac{BY+({RdgVD zoN{i7X9hwDBIG9|<28*I_dmiV={=&prb~P2yawwiFXw=s$?#>;55Ot-YBm=LS zf|QL+GIB(~umNBuengS+_;TxEcgqVTv>pMSK)qahR&ByR@1;@X;x;#OeMRPebeKNR zWZa_VF-5BFc6enFpG>x_@c!=eOM(7_1LwXxrHxL6GP zdlg#ob!GM93q~8F-?WMWM$kPyiOIK>$)A2EbK90*9ZB}277GirBwi$QoW>F*rlwJ) zdrd&;tU|sIB=d76d=2o8c?D`>a7A7uJG82S(i+M70Jh143qff~hDQMS!Z09`1c-`$ z%cWzU4EH&#x?@D)P`5Xp#0COf+#m`?FOc6SzoP%V78KMUf$GD1%=S{y>JQYJs^f3B!wCl8Jgd1NV^S*Uk2!k+>|znc z1O&W=9Wg*to*ajG8ajsnp>>ihAtrG!>FPi`BB?a*sa&ITNbBoMIw_Jf{!NWf*}ZhH zxgKGqoo}_eMAEixJGvz?UacEEV_beRuA>HMuQIyptHp(I^!$^@lZP>F$4GY$%Tj~UrknLS;`NeF>+Z>J}=uS*Rrhy32mM_|yX35unP>xk|}EDO%l68KnV zB2DE0sF0v1it<@c#X4~tVg}Puk)n})i~z5@IAREJvQ(!sifjp5;nRlm=U?yfBHJgS znN|gzKFQVCst0ioON11#Za-J5qu*RXZ?uWSF}3~2J#pWKlLvfwMgn_WcpDL-rHI-h$l%xCU-bA2 zTV^hG(h&5Zs0fJPNY^uXmY2lI;mPydi=sq9gOoTZ32xhXK!ri|{B2+|dxa_aXGf6; zx~qb(C?)n!9cPBC^K1Zqcy7nm?VI^kl|B|nj!6Tmk@a7KAQRu8E;-FzJpE9_Ydt19 z)1$Q_m%yO5Cw0MtN30qGJfpvZPx3L(>w8xp={dwx#!GZ?3M)zdxjp?a zLi`%OPOm3Vwi6%X+2eo(Pv&ETE32ZOQWnlY2;T}E1-~E`0WAR8=zB0fo45r2(56JQ z^dzCCclgxU=7$fd@OCkazPMPgx`os)dLBC<$LK?H>}+;Xicnt+HsT{V4AedHorY>$ zO<@V)_2o@y#gyKCr?|6kc6fFgjBp?;Oh{3u9l{iYC9}@eF*C*zeUu2y^TYCa5zPw% zfMX}f*|pV10_3O^A`2C%>+SmtLUyBNV3IGlTM3DQ^K6A7{X9MVFHlZ)voi1!oOJk@ zdi;69LE|>phNGt+4yPv?nejXQ>Mrt2_M#5YEi|m#d2U;Fyp#Oh0NoebjGqdoDrH!t zO4NEsHb!#s1|-@^zQ4LZJuFTxG?I8ST7J%IWCpZN^hel<=0N`4s3x6xw zGJNr%n6N?rh{hj%S+vTGc;FJ*u}sZASMvEm8Q|hPs9Y^HB>B)+{ud8qg z6gHvBg7&$;sr_J#8{=O5S<4XbvpDmd0GL; ze9JODAod`EUR{8f_cI+z2!fVGET22b*QeD70uoSuqoU1CsnTgB#_jd(3(+-#W@@C+ za)m;?xWMZRzwI@^z1YWw9Cz_2{b@1PmGjP)-; zwmy8HY#fyQc1~#2wtO}}n>uxfy2A@FyyEYNd_{S05kUPee^$UMmR;kC*J|h6nMlTK znv|iZr|NSpSV%wTa*7QW|GOpFEi{qC4N*Yn`S*s)hzHEO~ zZ)fRAm(HCuOUj`+&3)njoxwLc>=YRS-YUS&-BZ0!qq84iQ(y3+9N~|7%)BrH9ZgOJ(bbT)j+ji&#NW(1X+2Y z(+eZg?)ZV8*S!EsGGWqvp?MvNrOFYJYkFWG#TgA$Qm}+%&`7=%v9e=d%O)g7k7n?W z`VtrY*h+l(Lk5M$32qf|mBC5ikEo61N~xgGWXWOg&*GRDZ41SZC299Tu?iu6<>q1f zo5WH4q;a04N!6rjhosrKr1`p}#kr*Ao1|6zX(YF4WCZCHyy;YG>C}$twDIZm_34cB>CCt3ECd;BycsVk z)iO98Gq~e3cfAKB1@-j>urr_L z+k#et!uH*Q+Wx%uI=GJb!v6Zg!TG}B+rqMZa4%BPq*~FmW6^AUQHD`r`%WH1D;!oU z+$wMJnp*J>$KuWS;_dq4-TC7E+u}oll4IVIQ?(MsbH|d)_>$}TlH2)```eNSf>J<3 z@p62@3PCYELn%^1DY&5&b)gjft`w8744bbEm#_HgKv5}A5qv~hZ)+Lc_c8*)_+GU_ zAO(;?r@Tk5oETQop}dPHBP3%?0J9blv)`4c(v{D-m(v@Ac>{_011qTZ^5Cs17^ExV z^eROeVx^EPnWZae7s>@1DxsVp8q!1}$W;o)M7j%A?s-++Csl3G z>aMiv$It5iW`snaoGSIziLCFcwE`8CH!mk-qA((gRPfXZ za3_o?BoJ(;4)%3IA+-pDDPq`C1qn+*NMS@oqaY+QI7$F1kuZQM3~34jJyN}PpSHH4 zRTB;%U8; zZ1zyt7lE)mFrmiF^FUBKCVT)Ch!YE9ek<^p&~#ne@kmc(|I``T9{vscQ(m0!wQz4h zOr^lAJNh+3!3hj_iquXRlg=g#nnfm`R5oWKgYRbmkdnd6E0Ap6>yYf$3W+|0>LJ921L?ji8k)ej((}YxP)$I9#lsDF6)bdqR~c*txlFJ zaZN@LD|+Y72{2Ec?L{m6;5|`Nf%1rZLxI;&ls><(3cNrr@E{tf`<4tYk`J~HW|vMofy5rMMZGp4mh;Q(ekMF-ptz?Uq(%u*rK9EWpu z?mZs^)<}$_yeB&fm%LRchK!mWrNJjWgI`I97a0WHB2T{10QC<5Bv{yV`TGFSe$-J( z-ThvJ&##B|ntBTQy5+j&j+^5GrvS&}zED7)e9Yk`Xr&!6vIvyX1QCuhvxETC$TCOb}<6q!)3G{($c&gMbsZl1u}($NPQc03+*0~2z6B2#f}>iNn6 zi(}6VG(a*$b@0`18Y|+~S(#A%;2zV!00}_-Zp|BZoO%nouT?kdAy)YcCk9P9_Ac1fk<((&Y#a<#1V?;o`bzW)L{K{B{poegU89HI z>yfU7_OOML)~Q(?+Q5mg@|ADcEGDB*=_A9&L_fbCL#7N|@10r;?_=#7H122BKz-u6 zl#TU0Gga{H8xdyGr;s9bwDXm3&LBi0!ET%t#m`NPX+tlYh2*nWX!ymJGFs%>I~uX3 zN}_0{MB>ixD{&S(mUL*-(^tJcR~he5nJOFPNTNYIaPT5p+AiOhsGm|kg(Lgn0^%qQ z{;sn`@$SPjW@MM~NFGSc_Vif23i%7cHMV9;{^@S_CUM3t*mo7d{7f&2t3w?Q{%(Qs z;(0?4oM4;i;Qqjm*3!-C(nZ52&@Lrod&jav_D-1GBoMkS5Ytf|N%=J0mm7jg^_aX# 
z8o8S`+9_Dn!cl6NKK|0nOo_ZbHZ8fIS!;{bjta%pcs6qVdBr;NXC+^`JqBxN<|0?V zB3rRuKX2iWuM2(iKcH1i{Kdh}DSgC9`=CB8C3>3;SnpzC_TvxeWGcb<|3d zjSfPWo!M67uuF4tliKDMm*l?3Z~#01(&U86o1uU_-xvL5Bj5NLsx=MA z&ijvkb5E6sgHX4WAGVwzQcY~z7aR;3xMx`Uw0uqC4yQ~7GsxS?$SB%i$YUe7WF#n1 zfv#%_1Z)BYW(>V1-I6j7yd5G2eBVAeK4(~YgEoRhx_I7bJ+HDeJkzv^IK8DCgGa{au2&__thQ9*JUsVKh zr`(#e*_pr$O|}MEOVJs%5QvMIKc#tV1^vSC1y%6~VjwBMjplWR>rLK038lr=T;vsd z@$%=A(LM5g$`{unM@JV*=77*ef@bBB4QRVD@)tMcvKL@4L5AoSQK7FVA*+lsEmHnP z_rnzZJn4e8D*FXb$PP_eT3j~AA8tLXFY`;61BA)_hxqJFHp|59h#!eAp#U;&Zb}w* z+Am}}P+AZr89FoFog2O zT9TDQ{fy|fcxKx#J9RX2_|RoV@YfG1{3l+gILo${jnL;8vR%^x`T=M90%}Ir%#6wi8Yi< zlZEOHt|)xfkNb0#u>Z|qzM67_a=s3n?hznGk1;S)(KpxA+{PF2it!z&{{3beH~CUk z;tg2$UVx#2qP7=zYufmHC z2=~5z4)a-D$(9F77x;!D+n&9aTO!dZq8Fcfb$v-(L3(}^ArgB2)Mndi;4lzZkom*M z)K@a5QB{i!=$(44ACp`0mVyACt5ICyCpHge%jN%pBchC{p_+p7sciD>F9 zbwv_Rc#pbkzW``iQ8ak~HGM@2&8ByDrsPCxs;53Ock8PNQ6-1x^yyzqp9}DE`#w)- zl*rw%@-|4KLy7Za?*EbzB8X&|8S#!D!}Xh10G-(9&CiC}$*%cpBB>!#MtRxE0Y>?8 zX8A@1S@cGv1;xX=bcL160mjAkhyBJSNta0bapm)u#--f>-1}wy%mXGB!-55QxufsY z6N(b0q)n?!RSQgOmR*s}YIATI%<6ETOB2>@787FB?>8g+m^YpdJB2r#FE@Z|YUTpX zTiUnx%v<5m-m)~@;}RXWmE;Aov?FnWEjuw~`FA_8TaYbGakU%Gx{2JtR=pI~y~n-B zW{Cv-)W!v;{md<3>p{dSM)nGl318D8;u5)-VZlf6*U^qq#+N0p-4w0HMoEgkPP`K$ zela0e<5*$xUN*^SO2yoM$Yy4V`k-ozM|;6~wqMKCcHXEYm~P%gA)mvRrGhVEJs);bxj&!QPANT~UGQzxm@3D2y}18kEASWwW1j9qh9UTz^5km9YmYxzMMbOP@N>) zi6T%uUNfe*(bC4i)608HzgH&31FuTS9PhGl-A;diZz8o`neY%0Pl2K@xYluUO7NPsyj;kvoE( zVnTt!j8F6t&p>am_A$Uup2%Z~B8f5u-b+(F%L_?p6k}mTiqLt0my(!-mreqASv~PY zwg5e23sv-`V+-pS1AXT{1588m$i;cOSz`@|s96WT4C;TjfP|6x0$`ZoXQ{KeiJ;HX z?|nMwKOJ)5@{1r!t;$%qvM2r^%Hiok?0HE)5!9!#^>krp*z}ox=y@Rxi7%wr`OU6K z5cX?ZbVPs!OU^g6`VCKILvkApOn`^(J7gLlh~5-kefP29us$8W-b1RyQ{|G#sb;ozGljo z4aK8LOzCr~(5?``T;YX$d)e$3Q7{iux zYhs*XwcC&I{8XgUS&oWhLxry_pIuQ+~_+%VRGr}JJ|x%anAP2T!A7ZjFX`0!}F@Tup$%v zvjLAEwqHa9^3C0h;pX91@mrag-!mAaBSP%z+Qf=!{00`kBjEvra`nFk7EBKpO%^RS zp;%b}m+{s^haRWptMtd3CJ!BR=XK+i+x`ztr(!*>>ut*y z(=*LyO(oDXKCR?Kqna@ZBT9Y-(C8aTdqEV5vFU&;LJM{W&r~Td&pjC$JbY&^F_!vS zhmMUFUM+sbYJeMqo_y_fmw2$Hp_BCbI=v0L?O>Vw-mLiIeObfn;XscMwjbL~KLiaY zb$`{D{(i>KV<8`H6{#z|LH|mxUTv7pEXE`Tf&LI8I95s9<|k$a$8s#9uk|KB(dDfP9Q&Vf6w4&7zjNXp9^kaKVURQ6o54sae>|)2 zt(tz=&#-5V!_#w2ky^5G67w3No^5TCoAv17 zux#zC9RN*j*In~zOO3{_?SJ~{;G)y*<36drbAfGhPzAi^csB`e_}JsJtv=z;ZCgp+ zH)JDwZD$R;PBHa~GTNrRdYVHt!nI*_#n-b6ohNu1F8RXRLFMr92Y7Z|o9vIqtH}Ci zGeN0KRdu!%k%INkct1;$+%`AZxi)S1MW$Lc)qwt zQ%H_W;3~Tg77}R{cR4?;sb9?rxpgTZS(qsiZm)YpgJJ<)G1#G5=}zkg#*GH9WZjlR zG0rvBP{^ev*#|ozF{{xUr|}w%4PSSB6}Kz{(*iq-Szm)AJSdF{_~|SQy$Y*67e>Sf zhbBBbR){<1wfPR0-hqK3wTdyn9}j7-^I*1ncbEIMom-B930sVQ(FdT2En%{+2Mw%7 zTcKrZ#ZW5_Mo=+osc{|q;90HY%I&9stzt5A1_gn z_mHRcPONcJ=~Wl>GvlK5X}q+43GoXSc0M$4x9jnFUF$;}s}TZmw4?<~#p?Uc@Rs`e z#z9b^w!{n_@F^0i2D!Y5Aiw5KZ&W{5BcC4EuOECqS^2WFdyP-|ML@ugcwXYMjvQ1T z>>qqNa{OCzITfzyoFOi*_QqZ+ettR*V3mO8nt+cg=Kg-pVSb+iX#;C>-Lxcv{e=Br zbbGz)HHrlVt#7|<-=^zS3ADm<7}^fXi48LGwT#d4^!glhgX^|%ZClo>XSQu;;pb&F z>$dnN9vg5n`{sxnxeyXutP)z25z4|D_?R19x$S;U z3tGC?i}v$-;vdnL8V0P0h)MS*0(xTXn4y8}EX5<)dV|pH{fhkbzy9@P2Y3{gt(nfA^Icw+1=;v|o#+p+Y z1{(E7|GW+tM+EC-eS0nu6XhRsWfd2m9`j5$NLL~p9TZDA?X6`PuEP`KXXh@a8cc!c zZc|tnYZ~iiuIk@ZYvZFDha3}k_8q|6N})#?K+zN9F8P(~#=Cq9nk^AOYHwG4$aP;L&Dp*gvF{jbx2}5PrM&)yx~n8pF_AnUt(!)G!b1= zeqH=2k7=P~#LPwT<4n>c-A$4iPIB;ZgvxXb`cBA3oW_o1Y{qQz#$^H_FlpvG_6vT* zd0glb{ZG3BxijIs>_vW*Eu;dv9=<037LJbB|%Tm8dN3>3_+ zz;G`!MOHIhZgYfhbGZ90o#}IZ`ZL_)bDgDft!i?(9dpC#b0c`ZY|;o|%A z5_a>Fkn)r3^WI5?KFafku|V??!*g&eQv*-4T-Ou9W%;U-c>MYWH30>6js=zZ1x>pJ zUoOpC=nET=GWzKF~z0?t)K+9-A-iDm(+9h=c8kV0U1lnm|NjUkjqIK%E_h66U9ne 
z-jp^*mjmkaPazqFtQlkD`88uD7zri!0Ts7u6$E@>4yO{{gbL1v3T1APAaW&FLkZtN zMLSt}>PF=|S>&qZ4LHDR*==5F{A`h*Di}L~9ygFycNc>Z07&4d{)nt=U|g+N06xvH zG~KJFVE~)lRa>i9bdc424XoMcH9720F2$;HBdoG>tQuD#EZ&=qn|GwwVqk;}a42op zh9cL66V}np)wrrxLkVk%Fe^L>B~+11x5nWqw0l?0S8fT|+*yNHSkCK^n+w9Mr}@Q0By{a~GIB5SL%j*oe$Wsg^#u8dgUL z_9koelLNJ+VlS{_l1ZsfCfGMDgk?Hef8T4=@@>@r*3_ZXgd7e4S~Lx&*4D!}Z%a2j zT#D3h!D5z>EN*l2O80;F8aS5#Cbt7WgCX8(yf_ zU){C-h^$J=6YvHAQMup?$l83?0B)RUdf#!+ECAxpfM>#u4S=rqZ@XmKkl*SdBeTG} zvSPWMKy+sCpYo${@>B3K3L(FZ<|jhA$9ke-l%>JjV-uA1?j@dOW0q}WkJWw;8ZFTA zZIA0>j|;fhW3b0lqnG5hxQHT9LB1L;J7Gi!h3SOZtM$IF1U6`+(eq#!rrx5snQM!ODA!i%o%pz9j)C3;|%YoCXfS zBF2`9*_ImFheW*xj3Y;kBhbWbl^fw#f*0D zzXLbDn+zD73g{ac9NbCtzFF|?Bx`*mg4WhbvwRE^q-Z5?!eKuddjEc4Onx{|1LaE) zBP8kd$nI$W%ZVPviMCGH>CB0#&WTB?dI8$t3|in?r;^xDnN=!{r5{SmUzLUk@x4^6%8O|!Kh#SRu|9(s$u$2pAIj_B74k9sVdCX7T;G?WFL zVX+*4#{;T{V$Z#Wk+*zN89+$Sax5j4taCylkSurz#IogQT!ZaJQQe$Zc%^nciT>!-E?e&0HH zaA_${p)D>2DmbN-;#O#JE2X$Yakm7L;10pvT>}IU?(Xj1;_~vGGw(g;&fNcC|Frka zXRq&C?f-IsLgL-%D@@0kh!=qkO@~^_URuiYS{Yc{NjYULgoLE~T5X5ryK7iC6nGf0arrVuq4AfWp|P)xrG{ z23<6KqIldo*t{4giCyP>0>><`CWVslPh*b;iy|mg{o`}ypIF|%P^^(7i>1o!^`SEk zYp#ntALA+LW2nStEkDL9Q0rw|Bqeh#^`D1aH5c>1!!Fc;b=LTQm>2=1c+*g#Y{X!# z$ecZ_y_|5~s(E@6VCGkA$R76TV0>9w*t#_sXk@uEfV3Y{NU@gA3nBD*rY~%roBfY! zmf9u`M&){I>be3iextwkFxyDIh!1SXZxzR-M1M`93ba}3^C#IZCK&o-4X!L7#akU? zjG8wkYWzL83ff=s77QkBNv+6SZh9PDwRMi0O^6#e&(I^y@xkTTxPqZyXXDZD$`8*V_K5kW*S+mO+hvRfv#bW1qLvqF(L3VeeAMv3E1 zkD}?1w(W{jZ>}Hz%LL59e$~<&+N0yobt9fn+HfCb}KZl-ZjVvLsV=pFhsF>7QZ=MZ3$J zdlu~&rv?ou9ZFN5^DW0kN)Me(@+TV7ovzg$P8u$8TGMc4oB{wOX`aXQ%l_PN33;h@ zHt((Ol~U2twlUHqu?!L>U?(LFo3JHaeKDU{B~ZK=4W#+}v1NT0eU*FLn#L&QwP6nc z&ifMG?|7nWXN==)Z8xYr?2_qi-dmx~QR_TW!I6LH=O5Ct2agW$)Ba6tE^;uhX&78o zPMsq$=Pn}A&`xCZR7o6{8C!{VIF1JHN-u}P(!YNqGbV<}eL4(QJ`XcLhYW9y&?n#J znJ}GP@mT(gTyB!_Y4jZCFf{$!h<-N8ch!*^mt_D$5r?Y|zsU{%R2Y8o=kaX#Vdfd$ zt*$K`JPiBu9bNDk(%5PmeQZ1y{vnn4U;5*5X4=EEhDW&8^LnmJsCj;nCVI?BMwC#%X8C zzZ>RD$R~30i%ASkzaV~vIU;slRCz`Rh~%|giVW{}3Hf_=_2+itzTM#fZtv%^Z0ka} zdpwDfk`Ai<5hcaFfx(rd1Ifc;nF_dPZj=3A@Y^_skvD)|*;%uR1+keE4i5*wQ(}Z%pzb~k!k>EqJ zFs;Pw#*D1>(0^z-5OjOXQM5(GEIwsgH`c z9|Go6Gk2E;oA_-vO%SrZ8 z=hCk+-zfM|h20-pe|ved-k;WLF2KKhb&j8V+$;8|H2+~Eo_7RhGQ;#1i;~izS94n7 z{YOF8aTzS<4=a)@2|da|i(h}Fy1k6jU<5pVxAS+~_sUuE;jG2$oR&@sb(eC@nZg81X+IDRg~Y1H{s7{?Z=;)BXW4xWDFE+C7(q(As_ z(rLZChx_*6mz%!nMpttxcs~9_sbTA6YVmsPIb+yK&WJmTb5fFpIJeq|qaFP-RuN9s zfF70d?Ztx2GA=ug>+;Kt)zGlJ5x)W*+@2kl+J+OjsXOP6zWxg@qk+7%PY;(YnIBAa zzdWzK@03QOj}19vf5$}M4iwg^CbXdnFl6>+Dqv?KGXdHJ-8d6q9a*`##-?W4MWMQ9qtVl=o-8Ajf5S8--437SgZ7%+@Q{ z_*En>2ExoH7;na~%_$ftuPw#ary54aMf)s}U8A>8z4FZeyf0mo_L@ShJJO&Qu}SZ^ zo2-1s_X)|i#sHF4C^obWot4Pui@*LLr9GPXmG9-acSxF^sw4jtf;n&}Us01x-uv~q z9v2_a8^&ETbO7a;;JCRGW2~bL)RjeORd(2IE*gFnWBU59*0ASNRoaa)rr>ge(x0iS zbO|FC5vs^M+D())2`pcdMM&PB?>zG<3ike4J_h!Vxb=&FQv5#Aiq>M5*=wKK?hcsy zB}`&o%_Qe5nD*s$FrDWJ4DJ?vEF7N?a-u20xF-z@-c!k>{vgdSXMs&kI`LBQgN(}W z&#DxSU_ZH<3{AR1^_P8_CpAMvE? 
zWu1U5&6U?RpB(PG*6}Z2Blug&PmgjmAqJGYb*+G48!uD-m8%7Qr1PKG{)0`6Si%Bj zSYdjB%Z@8JO}n||n`J?p=gUuq{mN%Hx;Rn`v&7LwZCeudtlAarjnxTVC3Y2vYphD9MY8C=m3Q)SWLnxi249w!$W>hFhO{?O8Ni z1o^c2OsCVSBkaN|8}b50Mt&Y1_mHL7s<5TRGJ+;#2f(oeZ;{6} zlIPjHD4MpPUp8Xa7n^}E?jJN&)G3l*tStnz*R{hMh7_C@Jp;pcjK<1t&w+7es% zx(xrT@%S+3Ld3zY#cP47Rcc4O0TJBa{2o)wrGM%ZMPaKoF>~Z0Qn*Sg{(tI{7mvdD z5!?4i*6b3Cf8#EjE=!L8%-=4R?6|bx3>`na^Ke0-vTL*!>56vC%b_`%`=KyXenv z?FOT7hw)9DCyH1vW}dY9>+bt5&os6LHoLfLE`en@9?p3l#IDXJmmbg+?{-r!6%5}Z z2A>OQpPM?*`vAv()BJ8dK2v>p!qwX|kjIHS4)B>jUbi4e83Y zV#NbWYkZs019KqGmd@?~DB0mXl?iZ4FSM2bWM7#f_c@^O=M2;T8v5Qqb*3<2on0G6l4 zHCIah8AcxxBWMM{)eAsFMxm2L14sZ*yzI7X6rFuQE=fcgZv?O*0GvGRjz0*@)e3trt5H>THgrAs}tH zP{Y~4gzG>iO|(fz{|D0`Q>`ST=r~vdE>uQ@DxJVi7RZSku7e5SV0&V2f!kL}0Av&| zZcL^L@Err7QVTD57W(e-UC88$$LkQgP`6-9R|hpJ&)q3cTHMHFbB4kls#&&3#nZ&C zZ|HNHvAKe&^m!23v6L9sz{r)b&Z>AX^QUKp09KN=U6cQ~0nj4SZrs8?ti)ft0{eetdOHjDnlQaPC`;r zdD2O5JQxW+E+F8%P7N$inoxn*v!%1hWp$dT7H7tU?fT*iL7@#PuVga8Jc(Uj;PvF8 zHGiRSD%kU0H$5#Fm_Z?5BaTr!X9<)Q;*vhm7RR!e`8G6Dr#c=-JmJVJCKM3!aWBnW z2oIAzqckf=$Ll6bX;)05FVSv~;Lz0OYl7hvZjN&!TwW+LX*UmK=Z=k`J@+S8y)oa| zFG)NUmsuOg9tu2Viw2a(i)sN&uEDQs!_3@)X%=`SNDPt_3mg9wdkg3vJZG2Cyw0oq zgPD-8h4}`OMWFm_^1kcOm4Z`6gX7zvV8 zOe+mUlEOz_czTt#1kUydC>$Q~T1KS{3qd~%-NSs->3U>{t7i4Qn(1*8_1%r@p? zXJ;ul6vBEcvVpNQmZii2rBxOwU7#Z0q#}Pg7=k)3Z!NT=u#As`ceOA$%{Q7mF{-*Z z3Ljv>DVXb|RgP{5*f%T3SplGtq|~}qq+3)3h2;4ACuFUcl(OfI%w~O@PhTq(f79>m zDMw0^M5TjUcFtZl7ZR$vmys=4W!2Af{eh-DB@PW6KexIey00AYPoNUyTSZ({EzOZ- zF;`7VgJ-R6ReMwR{g$B&uP6vmZdX+Jil&HAs}c*Z_OLJGc&^f?Fe9F$w)mH*9y8cl zxbe)MMyQ_P50|PEwbxidlMQY9TDe!VXOVbDUAr@q{ViW;ydG$dU-eS1 zv3sBXwRV$6V`J4=V|5Y2_O@QTKWjX>X4AQeY}F;owSIP_p*guhX|5;<-`_;1>;{lZ z6QZ!-fjAHjP`L$;_IoXaB97%*RFmI^$UjrxPhYoaE-y+aW{1Z$wd{Xr1!Yto;UWI? zRIYOP7oyOEK^x+S@l)|V`SP&ef(7lfczj!>pMKq(CJf(xgz70^^n0ur7TH)EIP?K{~$o*|wA0@~msmV*I<+pt8 z(=5&{LNp=K;R_@I-*^ily0jw^k@y@qBcz|8te>rzvd}9lY1$tx+jSw$q<$T0@<^@F z$`lAP0l%Y-#!H>^xu3v0_JJ;6L*Ij%iza9t zHN>Rf`20Gi_j7qKTXk>3ovUkLZyKjW%0X|ERbPf~NUm;Qc1mB_LT{nMuVUK%&rbc~ zlKr-rZJ4W)lPv^akq43=Q}X?E2N>Eqte`CNMxuLrOq@l}*$Y5Du@Py~!D1rHm z$e5NTvu4j#j9`QR{TKyhZ;FVE~SXI1te zsyhe6tlhV(eWRUL|yO$E*q0-sL*jp}THqmhoQT313K%<=TgFqGF z4<%7OAmq=4RBYFp>}ZPuq4$f!4gs?f!qM->KPG=+sjI^$je;h%7bm~x0KXqj{(yHb zAv&pQ?S1?woPLifE>6`UNh$SkjTR@rHme2Dvh)|LS=CH?!qt~Ovtx_3%}b`_4tx9# zr;c>%&D^G8*`v?k(}MvsHSDu7gb~ntg?Ku}u)`rcXe+3s1FkoldFap)JIm{U%DN{^ zwk9pI9*BU?brsH#y`Co{oPYMWN84(i(r>%+DAk8E(!WZC;37Tv!CpO?iR(VBGV~Vx9MkYjnc*h2;cOi_dizWxz{^MN8}V zOB#WTD+J4hgG(!1lWU@je+eOG&xT1%=6Euff9x%>nJgQc8e2>KdTa6vPsvPteMPWy zMT~h>A(sQFv?>|ADiZ8KxU|Z&vC25HLUW|0Q~O&h_&2NfZ;cnfGc0Fr@0ZS+d3Yb* zT{V;Bf0Xl#Ter$x7bxWrepnYE+VHPk7o*>h65EjWUiVku@K-ncda?0Ai57im(PG+u z0N3UfbQ7>`6&PTS`O`cRd&{8IyykulCAv-lALh5()Zn(!I@;19+74l`a$=;naz3(j zAzH~5UpcT?f%R_<5bXF3ZQ6#Z5)+P&b`P06wk((tP3k>gYJR;_T)1g%v$H;>XspN7 zK-l?%8x1qh4)4-+^kg@?#cq**k3eJZ#(58y!nTyiwrt3jX~c`k$iCKlpVXxc1F^Hb zxU;ynkGbelB0lphNU4|`t&{s(cbG#*>H$g30mmiEaj9muf4=R3#2yqnYg0Q<)U|lj zSybP1!Csu%zYi_l#>6~`quH!q!yD39OO50X+C0F1b$rbIuAjWMo;7izrJ0$=$C??Y7S6pN{`uP$@966?^4A>LNXhv8 zJsJ}WUFe8V7$Z9V2UhM;O*wZ?`*(ghqNcqS@c4K={^N?%XdzTH_|;4PJf92J;nv8v zNH*H55S-gTt&A%=|JFGEZQys|era)o%Ov(`ewE?Z z)n(Uou9wNmk8-H%{INCP<@^5-w=$Y$uHc0HdeqF$6_AQeP*%&_ErgA=JaKmS8kZWc zfsMh!eXijZqhDxYUs6G1XAhkTRV4L&U=0xy}>6Y9BO0ck>T?t5NaNa$I6 z!MvL1JO$wQgf^9n9Hcg-gULF*c#%$;fNV)J}t;5U5kHkFbPslAk^&6SJ z^!A`QGCt4zFq%%AV{RB^X4mMISHk;O?y;O3Z^J>7NzXspk8p*s@#g-OBBt$?;Q3{e zEb9m#v=zXeTre-$X7CWNfR|76jrL_LBnx@reJGqLz@k%arXFNh_oGeQYoCZU;x#Al z;(P7Fm#v~`KF>@j{EwqE?dxL2--0Teni{~)i_Lus*5#L{f3?4`lOIiWL1~1&2?QP9 zltgg|C#@O=|Mm=Bjw`{udY3ce!g9(q^@3`7!aSOLaqil11#W3NPoOu%^I@UU1;Q*@ 
z?SgozW?(B8CerYDwmHE2&4x(yrnEw7Q#+Ik<<4~Ar2+rJ;`fi|IJ7hQuP!yai+}B} zsUgb*7u)N`>$&7S8vBWVW4E`D#D>0)A}oe}XlFen{?LfHQQH94dZ5h~TGd=&Z+)$V zyS|{AipO6>yp9*GGlh}oYxkKjub^FZe{U5kQ$r}BdiwJZK9du|TF19}=13McUaEz@m&jbJb6M{m;)8&G~)C+C`J%-hjk?p9q=?f8sWuiQ2T_1QzSMVWvU#P}iC9_CNL zt>-Q;wk`*>7Ql+(OVA#OkF0MS(GK8ALL;%@$w8+XK=Vg1KnZ%VUxU%xuYkzXwyQaR z9()20Xa$Y5tS`?HI>4G|tXsUQk%Z$t(<@i)@5br)Q~lfE>yL0U==ZOK%57V~JEnLL zSrEaA0j%_DM^x2K(SE^UcyNoCC67R%;0wk&AD|ffoE<<3{8CiXwo?3#=fmCJyOA~& zfp2juCkK>h81Gp}b-OMBptjA7Q(<`-8{-TZh+%k?rldpo4-tcsr)xIrc@UaXylD8- zcv~q(ilEEoq68U%9PN;h)Mjv2@a+1DxAHil*WbsnxAC8`xs8$j__O@Pz86bXD$NWt zK4td|8>Y%dSM!!>4`thRtq&Kw;V)aSkMSM6?hZA|TCYzH%398^k3CjzCm-vjKV2Lu z`<&nY_&3MTR!%7CAC!>9#0jJp<=u7G?x*v6fnEnX9eUXj9dhC;oxDysV8x+;i68Tn zWs|fB952csv;}w2zCgJ&JHneta^vr##+gI% zl}beQf|TA$hdz6WnN3s9F=?WBi_lGmY3|<6IqE&;M>;fNc?(Ap`w2Z_^J`%zv0n)< z*|+$@uX|LwpON}u;+=mtAN1UFOuJJUr+Pe92F#4TuOCojato1EsLhV}P5qotEuT@u zD9bf#<8}M<({_{Ya`aBWT%lXs$FYb|r)=FjvrjF!Zx~dgXcq^1xdnmJ21I??kJQ`8 z>OV1`_Vc`_{and~IxzJi=1F)jp%m^qkcs{yUx;Z>B|v*3%jS$+?vI69#?AL!xyjr& zh3x8OeG@rSKY%hhd7|G{(5eeRlvwE%>C6kEyhmfQol1)IjvI#>_|CF0VkHe;+b0oH zpGMK&Gzv_ zY*~eoc-Rt^)uJ z`eKZF6SYuzYE6kP%yI)IG~LMjPtk2LR}aHurjz?)__380YC?OWw|2BLnefm|-UBY1 z`mxAsP}W%Imc4zcwyu@%&?eya>*MCf+MdA#(=3n0gWB5qc`su{b(*JKDSz`Z%$86YjUsb;?Gl_ngy#dWp6B6+@rK#Y(GCOeV1nWb^*WjB>APcl8wQ?<#$iiux(3k;>&+4IG8so@h#_NFFiI{ zQg0%VjVBG^_a_vzchYrY;x%tw&mlhdUoO3mdXM!l-r`=LJ(fLotbTI(x1RW5KHYl% z_~^Mq;d<4h3bT)$3$uUcq z&=^_$-vD)(1^kQb{Mi%ynNa?$z<{@`0o*bH9GU@KW&wiW0Ab1i{+R%ws{j#!0C9mp ziJgG2l!3C~KzY|dMVUY)vp{9mATCzj##mjF1o?(o|Mp2Wu^=^rouDUk=PzKeF>CN| zaIk59un8*Id?wg(C)geca=Z%u!wPbi0lDOZ+)yCb9gxQr$SWcEM}835j2{SS`<=-c zf&#;Kz~R7M_+6*=ehRx^07E!R}9oQ-`Y@PL2uMA{c)3GN%Y#;Sx?hHEyhM%&AzYV7K zneyZd4!gc`{EIS;nF@d02}i??z+j8Ol8wMIkH8Iyz%Pg(?2RDajUdI1e8v_@E*nW< z9!V7vd7mFHixxqL8^y>L^;$NH$vlcBB#NydilaA*Yd7jGZZt1jH1|8%=nv-6{2|eT z1<}I2(W1N2Pa)LLY%yPCW4@ZlNQcD87R1Q+#whN_DC5R{<7A6fla2jh9;@+GMJ#IBJl-WF-mM_sqc`4b zH{J&~!H+E=KsF)BJOLDv5W-xL0O?JD?IwidCPuO)M$0C~nkPP`P!kIhlY0~4yNPMI zNf~TOf3jqga?F$RLXrv!l8SnhN_LaVaFZ+8lB;BsYZ4PXM}poYBpc=@7mXxy)hGYF zO77@Q?qW;nrb_AEO-41O^z5ee;ie4Y!aJ{1#?0ZPA@B)7_&_3jp#VO84PVrPuQkBe zsZu-5lXrKYPV7?;cTKs9*I!Obtr$0MML%T`C z6iUakfZ!#i6S}7p+@upVrlYfG0I4&ug)%6TGA6S!UNmNq7pBwWWs(VHQfp_voXcRu z%b?lIc$Jj-<|dPwI*YY0lU*+3-CpMVB(%}4G|{B+G7{6z?Ac%BvcFnnONVAZrBLPj zvK9BTmGN@Eg=QaZXZ>LJnU641EzHsB%hB7*F~H0H$(}1|0amxjO^IhDGlZ;k>0%k$XF^TNybVbAxI%MY-~4+_l(73POf=i$WVn&K6J zf!XO&1+f+d@u3BYg$1t3eFgBnf;7Cs4EDk-xx$<|bE=PM{p1Bja(?1_=zzcCD^@QKGnDv)g~0x?&UVl6(QJ*kfBAbjYVB@#XZ!;-ATnLq2m6%;z7KUmYd=+>XLT3 zl4*;Q*}~$<#*&%2lDWR(CF;@@p^^>ll1e#AE=A(8_NKeOZ z9u;`~73ee-t%d*bD3CXmGc;Aany+BgDd$M8yx9{!-9z8&^F6ev%E`;UxiMeCDB~6{ zJJl8~>oYGatPcYQYcU*^OvM*-i zE_$+OE()m+)~OLmt_K%YL+9&5J&GddtLnpb%3?UG0wE2F_zf|Z4axlt@%s%Bj>gE_ zCu3)0LULo4a6`U)V|r6#?rmc_$CJ^ah~}!UfyTZ}yxwrWK1seQm&F2sY#D2>qr1iR}^|rtDe!q3I-+WHK6+^+3?4%V3io`8O;twFNIm%95 z<&mVGZWr@Na;r89Xd4xxwIZ~M?4WJ=qLiAm{k1|nlU2J!G8)}o8waQLbIP~Y#GY`H-kcO$8`{2t;=Z<9@d&G@`PM2G(O0F=AC}yEhTq>%te>yj z->lH@3F&X-91yC-Yq1)rxf7Qc8R#dFM|chl>-N^}^bIHs<}>$?KnMLr`o?t!7dg#_ zQU+IPd!C3ejU}t0*MA1riib=ViWWSFjy->^AcoEdy8i7Aoj`}1bO-(p4BLA4UlWW> zDfHeejQsW-MuUww#|~i)j)>h2;}VXlG7l3fjtY5>kitgqsz=BNN8jF!e5WED+y5|1 zr#SY;bCeM_Hi#Hz8XV)f8)YLLclj{Jr8rLLImQbc2Um}N7#t7t92X><`1yWZRB__| zcKkDJLa=)L>)-^D=Y%ZbZgut~?Ni64WLJGWz6gj10BlX{9%>mHLo zVN*(9x_=K&9o$Zu6HXhxpR!h*PVktrfBFDcPW~C3ZuOXUBb>?oIO3%^({(%T2b;mJ zoDNc)v6Y9FV0mBQsxuR>ow0+m&^;lpQ&q}t(0z_Z&Q>?Je-e)&$q)CMo#Ct1}Bj4 zg<-hjK=W)5;S+I7)z7skdblt(xY#1SFgrLsvADPnGxj4`GHzbnC|N4}vlPU&B(qPr zOp}>=s5rArw|q&5SF5=E{(kA|aM_V>`BrcG;(i%R%mZD2`rdj4Utettw(?bRg@Ah% 
zn!ZfDG>Rv-N_8aJY`v;WxB8WuXbtCTk#1>%l75XP)e}u=jecpCwRDxdWsSgkjaz9w zG;fV}Y2u~+IswtTz|!!$mi5mMz?sAKyW(|;hhE3&zDSL$vv$;!q@ppUW-m}GG`w)@6+SHAt z)crl(eZ<2|{loqo?Lm9kOr`z-#Pi@2ifE>@<-irOKSZ?LM}O$Qa4q@%ka9Se zIyJR)=z4d!LO-=2c66Y7w3Rx!mU?81I69!8+<7?C6FK~JX0v#te{4c~d=)nFx8+#P z^Z4Om;fDT1Z{Zj-Z5-fpq9SsF_jUn$`9v+{goJpU;M1wp-N}oxd9t)q72VTx+LvRL zkEf!D(^t!L^af{Av}dg0qiBk_C;iXIXWmYoUtJeS`5`Z%?T2du0EAp=s-KBCXuEWcJ zZI8tlvexG(a-5pPf0L2_M4NDvM$3qvSG6X;#~wjDyWx;Qq9vCDWDKuj3@$`HE^t#W zV8p{##~0CW0incY$%cS9xdsNqTU@*PxoAE#yPK-;tM=BbQiGN7$HK_R3s}=lnfrCp za#`{`TbeTBr0>1#ay?qad#AS;GAfND4!pP|=&u|B$j3t76#%>vzz|Wt@`-ERhkNT7 zeW&cF|(7z4<|%>!gbM+eeTadVSkgV2R|p)oPMSoje{B(ZQ@#zb^{ zG+Z7K8YC7sAt?o(i=Gev4}M!iM#cd^G`F-O+uA!iySjT&z5j*Z4v&nEJ&sRIPEF6u z{=dYxGaLOWhd3=P5?zL&SHOWx0iBPLOJ;;^3xy>@ zbovXk{EK`zHdu8|x~%XA1(n00L(l)=Z~rgtt?sI<%%2)=F6=9twy3YY2G{@q+M}ZD zB?)Gm#yevnwMj!^bZ_~3X!;C^N{mHUH= z3>FxKPg2D({rH^85F1FwLwiMY`t6J9YNS4$1T_W#noJ!zqB!zR+LSUwdGtg6Hw*wp z+TD*i`RP(OIqlU#gW;Cu|Hf~XTU+jLucrG%hhOeD%P?ZHuK12{kZ8(_Pp|hf-`ex3mw2f>wv~Ytvay^k_eH>42dN%q*UWBFOS!c zcz!~sPWf449GsYL7b)}NdmITS$Pk6{VFYZwiIV&AiWXan2=ljCfaL75o7QgBa=c+M z+g5^cocUIwS#H5r(je-`Qk?o~(0Ga`+WT!u&7m_ez;~7@ib%8VjX!|$dnHQ{grJ)C zC)FESJ79VYEp>ioCeNOkg95poOxpWY|8;5bPx5vskF}i)dI&RxkkMvsfTk$yf}$o=&8vRT>fe_yJJ;_=9bVvqbumpVB6dn9{iBL9|&&gsPgwM<;-vOtE`d#~!D*?fTfeo## z@Bx7@W>m^4s%PkMsIS>!2ls)w4ompJNEN)9vDP#n`li_ln;%CQNI(*J;yAAU7+U0j zPMOk{|1+?WwV)?7eHfu1;Pz^g$psNBT4%2RMRv{iubE66rN6BLbRQNhMjHUI5EqqefCuc zY>bA{5JPmP$%~B%`1V!J;2W7hP2o=_FHfWAuaA3b40?Sd!UER-J@39!Z~Z1keV*h* zVPgtlc4Z0`8X03aYyz*+^<6V#r@)eq!wyY%+!w|vpA&W5id^Cj9CNbTP@ z1{DSyLGa%cNC0M@6+#os)n=6UHVZx{G@Hhdg+RX2f zSkycUbFF3KmgkPA%xnqag?lwt%Ik$xZdSm!*&e9)&iun>0eIuLAk?9lX+t>82 z0*7#7i-ombDJQ8~tO;fYm9e%}G6ph?YB_K8l^@yc*J4=Ad{LX`FG=XMs?-IUqVYE% z&*!On*!tuoDTKeK@ujn&t&qG@L63`$=m_XI#NfqX%Eu>|8~Ok9Ol|Ka{GBV@r_>nV z-=lYHV=|+8kbQkJOrT>|V`6jNDB1L+1eDNe`}t4CBv}zt!Td~3`Hyz+WZx~j^j+jHx!1k}4@}QRe9;2T%n>C| z_-#3?#L5&B2EtYgV;SI@XWxhyG#yqB&Ylp+W@Ht4_Mu?1Pkkkf?_Vo{Q`Q@9r%a*HH zYmI>GA3}JN#pgG=Hm3D0M1lKk*yT#1Pp^5LYU8KV!M;v`=`Wa8R>jSu35N31)1Hix zE_8M$pY3Ec|J;Q6Mr{_bQ$|cFeg}eaM?SNfh#MVVJ!4oU!((&0if?jufR+rFMSv7G zUeQp&YJVSsEjE{NwWTsaO`(QiQ+cTw*1fE>d7pmry;^y-6-cSq8RmOMQQdpc1ppdy z73+p9hiaJuDhS_VXGMHh_}e{oi~qsp8I~ftj$Mm1QT2a%OcD^Q*1O+)l%&$A9xp2+ z@-fEGwbCY{NO93mi+GGty8-Jg z#bz_ceM!ZRnfw=dSvTt0RiC^@a==#H`s^ZQu0 zoaYMP@2Y5M^|lh8&}`cebG~hy+Qt4W;+V3c5Z*Y`qjXW3RrK zaf*UDNW55FH>n?PTDly%yi!=)(5tKKYvsCZ?+`iJV0NFm9lGikemLC~zb!0jvGJ6>b!3F{)iubX`|sW zCa-sYrlEWDOyqL<7Up@}^mu<_P6=c z?706*($4rtYk0n>cdF6wf3F$vaYpLg)S)ZSpL-_2ciOCJ%Yo4~KvFYMYKEUh)Aw(; zGv`i#5|9OcX=i`wE3Fx%?i!@3;kOO)>;LGXd==C@_2;vu{TFbMiEFS~0<)}uz&BKo zHBPXdCPn9^^Eidg_k`el*94F|>W4PU8*@2U}DGTgkXk zQ3eED1@80&sDZ)Igpj2Akb{VTzuSMd)Irgdpye}ACZ$aRDx|PJw78xy>{_q`>8pnhT!m8Xc=*ovc4`i83ULQembZ7P*rU=Dq{fjrXT!2|ICxIBU2*sYowe!$W3Z zV`@Gv6yf8*(A6EQZE!fIRs<6&tee>zupQnB_P*Tq#W0T`6O07phlc~h&QTFHnxSkU z=sTH62CAqJ;D|$%^Ygq&!fRNtM)<4QDDGXa7Y*TX3NUL!WD!M#DtYAFhG>yq*Efj~ z%-6vm1VhkVqI7zqL_-o|H0$}&Wm2} zBcHstxDqwJiN0pw?pc2qEvLTOlqc=&;Ce#sZ0xvTm{V}FX94_4dyDv+qI+pKb`8hX zgloINx1OxGi9zjdF&jOp(NvyBf@y!Aw6`)dAeWxkks(j(E!W9z+LQIRyB+>VFdgeA zz1cY(@5y>A$Gl^n+yTs>?t4k5{U7P=zqWLp0zaCk0e3uu;Xl$_dWxioRP#(8ZT{wl z%=g+^rl2HAenZ~ItREF=AD^7Jd-Pm;nHIfSQrdR!sIyI;q_@Y6nXmU!r7beW`m(=k z=NR#3gEX?gxo5LltH|QznkD5P`sQ@krv7fs&@0S!5X$Rh$mPDt z)e*{ZYRvP*BecSEL}yQR-HWxK%L`7*b?!?H=*xPN->Qb)hi2zd7uC<@6=@gM z%;h)S6p0HJA>@k6UDaq(VJ&s_0K_ToP6!mhpI3HIb(>JoV0e)8T zg7LnRwZ??$n-WN2$$C;LioIZKu2h4xbdS2sSgZ6%J9kHsz3j@OHKQdRI=9=p0n=JfnpkuE`e2U;YvFp= zy*gRYe0>f}eJCU6n_|Ntu$wXR0 zZjlEC$Y&{S)oiUV?$Sy3irW~hjBys)l)-I`p6zuT$TthA%+Pl36e$NrG${tm&m$Oo 
zPt$G0|5TR#C$jXv8%uYcQdS*O3}~c09SsanRm7n^i$HPmpg{)EpUO&M1E{E{7@cl!jAw6LN^b(9H))|a z<*qlCwl7_zFY^h#_3X<{>B~p-6)yA@-}PlG^oFFMf_3{(uzMp?K%9(*{fJ__`h|WZ z?LfQ8K&S3Nx90#VWuWg#?YJ=TgyD|R(+-Y_3{L0{PI(T_qzuj>1{W3vm+l5vXouEB zhBoMQhqgS2c2b7+5JLwGLq~T*C$z(7BEx@mhc7&bX|Ga-ZxF+G3&Rh0!%y5Ty66a| z-UznW2oOGkN7Fn)usA|=KSDw`N+vq`TyOM+*C-`?l)8D8mU?lN{(h8!ZtRul*c-hu zX0I_;_!xWh80X>`nEQT=hi;rtbo{;E_(!jC0r>bK1KQ^Ze6@jbF*v@cSBJRb#FzVV zX|9RSdJ{6u6S9L7a@G?{qLZq6lZw`p-@GQZ;ghK0I zbjs3t>NkAK>VC=^jxPio_4k~1g-^RTPkSy-+vrYH98UX-&IIbs1bfZ=93q%`;nf8| zq9P7v!U<+0q5V;dsBp#E1h3g7y4l3R*<{h#H0#+^(YbWRxlFyee6P7ez1iZ$*@DHn z48r+buKBXVxgzWNYOnbU*nIuqOsL|h^Es-SVF4k&V7`UwfGu>DEc7(@_a80{!WTxY z7bXT5CPf#g;EOZOi(`w6!~b6ZUl^e0?cLG+-QpeI`=gr>f?cVFn z-R~{m@$KC${M*L70`TCAzK!1z4G)B@iTCZ_!z>s4ZK(uq&;Wkm0iNImZQ#8);6M)E z%L^Xi2tMHg4dJ|4;TXQq9KMSZ-r&&Wa&^;&n{ot(f90KF~0(ED0b3 z$Rz+HAOa*XXe<97<1g&u%VH{b@&U~u4FvE4FHjCgp5%Em<39e+PF^e-KpY1V03^@> zJm6N4KmgJJN#)?>dDG)he!@VGERYlxz)(#BU;#z`3}8MEZ+AHn6$V(5f!&#Z1Hrz8r-peiE3=;5^In*sneFzLOJKrz5N zhcE&y;0wDh08IW1eZ>pKJ`$9kT^>>E%`)h%F2QA9EL(E{#y~Y|ZtF?|4F7@bxeo3I zfB_b8NEsVI0?5(rq8(=Qrc+ z;m!;2-s^|J=HM;>i_Qz_)$UF6?gh^aP~sR8KP${4?fPE8{5~c;(dNX!6AjNo>h25Q zF7bQ<3_LOKz7Xq!#0z@?@g?6b1YcUeaPrI2@f+XHIv*wnKMW4i3j=@i6aVux9}Jl= z0=@9?Dj)F(pYpx{3kCoKC8O~>-@nz)CLezcD=+j;Z}LSy@dhvUV-NK)-}EzY^vK0O zO!O>0fA#77_Foe80DlMq|Mk6q^JNe5P`~g?|LNeK_GE7l;}r=kzbsY{_wXzC`SBn3 zP7K&$QBx4i71I;7C_=-=?q<K`|Pay_OVQzPV^c9?h6nM$Wi=H5B(0W6t@2IzOY@)4-A>$0KmW<$DS;SZ~ftm z{q~_zhu;fw;$FT09e~gQFbL4XA%I9Y$bgukI5>l7;7FLUfG9|KX+b&IF@iYr$SBY` z=+OAcg9!8J`U)E>J4;(@dke54v4Y#{`wJW_JWO0{e2ko|yv*F}{0to}JxyJ0eT|*1 zz0KY2{S6+Tt;;KJUTJ_R;1B=d*s5e>OX5 zHm%yVY}>kh3pcLZxpeE=eTnuiumlihjO52rR)BsBbQopoDua%ZUb_NFus9UR#5ykV z3*gA{BgKprC1CJ$24=m!BATfdGSuj%Z`m~%h&{JZ$^M)_KbR9=Z?mRfF^)|6b5=H-`Sj!9;jX8IE5nQ*14W}9xl z31^Z>#>pC+bXMMpXP$auXy=|bS?On>f(}Zkf_)aM-F%2H%4nmGVt44HC`Br1rIucb z=}&WJirkW%ehO-+qT)p9sMr8%YO1QP%IY%e2&9i*2?RTB~h<&2|fJxZ<7|?6}NuYi_#g zt~(vN;iAfJyzZ%oKv|3+XC>x3NOs?Zr?T>3%n3d zOmW3Gy(&Qv67;Zf#~gp`amXN#JWIwRpFA?k9+#|f$}X$yvMeUQOf$?I(=xNoHOCC* z!xsMxbkHXGJd9~ZAB}X25(kef#b|YcMfAeSf?2ugp(>?e@EZfBvxVpCQwr4}AdimIMZvKHojB zZtrWL0|!<;|NW1F+LK`K6!nSx{!W63ORsGu__88d0 z4T>&?7TjDB?0C;EIa0A}Dz%%1+7hl)UVvq{1i+P38}cmn7yB zIax?UelnSH%w@sq=*wtI(@&BtCLpc3OJULvnT9;%F2}~qT7J_l&YY$>&xuNC_ClNA zGo~st*-A35@s>DTW;f-i&L8Fzo&4Mfp|yyif&Nzq}>GohhGBu8<%P_vK{q9iRTf~t8@c*0Vi7+q;aWeG=dR*Rk^Eay#Z zYSNtQbVMk9r!3gHJ$D8)7BN-mH;dZQhO%X(J8h~{(F0IQ3e=|&%jh=iMoXjOjHVyN zX-$~=)Ub*bQVIl*RoiJ%s=@-O*W>0vJ$g^CE-0j9t*c!L)YG97^{hcnt6uNNRRr4e ztx8>sTr@UhnG;799e z%hT3Wv(GGSXI~53>_`+CY%OhMLF-tTY80lFB_Lc0OE=fXcDTfC4r6h(kbsUY0uDl1P&%e~vOx4Z70Y<(*$UYYhxy8LahgWrjN%ikIK?bpaRE~&pBBq_#WY6oiwDt@64!XgGq$mR z2`K>~2qDNr7BZ2KY~&;td6`HqvXh$(=tWC<(uQvIqh%3kN^2U?mbS_xCIISCi+a?gF14s>Vd_+?deo@-aG+le z>ts~9ShJ3`tpi=_U*-DNyvFiuDYjbcUJDz3Z>P3< z&hP%R+uXgPd!Off;3mcPI{Egvgx?sPpli71?ri6UZ%E(-ued|kTT!1Rj{?Etcm+HT z@{WuA;~l?o=N#_vblXSe7H>ImBdk%4JNKPV$lGeCIgFXU$1OahD4{ni!9{ z)SfF#`GS<$2MYSopKeNn|3i7^0`_Mk<+0Lc03k+xhh4vL8HqemWO@q zPlfqayIyCr&m8Rm*L*VD?=DuiqqU`SH*nv{Wp}&_ey38u1=YcKXK->ow|@tE;0F(R zlIp#$ddgo7{x6j))^yB3%<~h81{=^>jwC82v z5e8ty!yU&P2H~BNx^#QbzV^g_r0N6AdIkTk_cj%ij8&LcrGJW^Yf3Vxv;_%?#z2a%kc+vxY`{=L#^zF`m+o6&CtbD!i=}#cg zXMY12J>RE);U|Ar$A2JVegx=%>|%iWgMgy9+4ETdV$b<^RffJ@iAjo_q^m|j-Lra(*DAgy;BAqym<2Y@sh%MDWNRH_UC+KJ??5K|KC@qOs zG42SD^_Wh(I4|O8kNcP@^caig$d3W}O8`kN{wRYy^pr4JjxF*(mzg=#Udh zO$-Sw5J{04IV0XSZykAVv5+zmAORvtk|Sx7C5e(I`H`?dZ{Q}9E9nX?S#B`d4KfLC zG}#RpsgXM=eC}sN5$AqD8I(ZzZ~PW-9Z7Hf#$`i;k38v=gCdD%rj&9sm2Xp&a8{LU zla*;>4ifp4UpXd{lyC}{cs((ek7I}kcb15AV`jOQY?*jMsZuD1auVk%773Pli6vwA 
zfuZ$BLrH{?wR=bOmr_HQDfdc?=P*sFmyPKr!B}AfC53@$Ta#%`iAb4McXaRFDncp>>(J7h0 zS(pSCg~SPm708*$xt-nVo<-7}OvGH&d3DlRXZG2D)(M+#)Sk@JpYI8v%;|;nX+ZeN zd%o#{;t5}Qn0=kupzIl-5gI4)iAf5IngvRr_nDzcsE0h*p@Np2wGt|#9`c>_Ii40; zp5_^xDC$ba8HVrYnM^sNGfJ8Rx}X?(qtltA2Wp^VIGa@1hN$(QxNw^^YNQoHp+Y5D zk-4Mb)T7slU_n}fGJ2#{x|cS3p*otS8rr1!=bGqweQy}1;GmOMYNo_dqBy#uEGkcy zd7@0ZreX?%atczPX$@t1rg{1sN&1|-MV`n7o*TMlKYE2y8eT&hA0euzi&`IiI-gDI zq=M?DIXHn2I(i?v4S2cAsGABKj{2tq2B%AEqL2!uzGbM`Nu`{ssyw2QXwsUL>Za$( zrWrS@M(3dE$)L4lr`4dTtLm!@whWWXhMx+ZDk`dKdaP1JsAO2FxY`Xx`m53U9iCdN zWLK?wcde#Lr$P#*b4sR)Dy`w#E?AnZT3W7Lx}}jys{83p-Rg!ADz5RWgu6O*uR1}o z8mFP!qS-2#?#ivVs!;Sg4ZSL_1uIhI3aEj)t_ZrC{JMIZiKz#=iPr$F&jxF;Xo|3) zs<7#*uIVbQb_lTEs-6)$4&ZvRCtD&JTdex3v7_p-_6oAwI;A1|uHTBND2uZ{(y9l? zuo8%`Dax|6YOMQ8v;Qiyh?uH5tF-z-twYqK(JiPwu`w8Ml*d8TV~sGGXb_>7a7wKp5Ov8xxal8m!Ri<(x9vrxOYyJ@*A3%kp^l;*pz0KCB~ zX~Sz^s_P1?#=NPPw6Y7mN&>qD(!ABnYH1d|**mTRyD*Haz2A!r-m51S3%=z$SJQir zT5G=RyAtA?C+<|szVW+$kjpUeE5G+US?GH$48~zB#$jx7rZL86jK*F(#1M>3DJ;$F zOwHZgQP*rJ_?*uZ%+0_|&m5YzJ}AxujmnYSfln&R8)eUdywD82$?;r$mAuSKXS65s z%oZ)h`&`Xg$Iq>d!XS;(vgWncI?|cf(y&+2WEInFDecC+%%g$|qX~`B%S@sIP1E>` z(Fr=wBCD(uO(7rs(*(WJeOc1h+|(qU$w@8M+G);mJgiUc(*9htjXc#|9l2Ev(aXAs zu4mK}lFeSds6pMZL%r2SJ)NBEpg4) z*=y4sd#=J9v89^T?OfQ84P%~c)iLY07Rb@;Od?1P+3YJxL0|-)4cebA+M!L_qiqDe zDP%_=1gg#2tL@sY4co5WmZp)~uWj40joXTQ(wkk_szVOD0NlSV+`&!Uoy)q(d)$uA z+kAc4J)yh?^xSn8-AXvq%PqgJf&@s=0^7~q-R<4q{oUD(1hXLC+1&t0aNg;S-s`R2 z?Y-XOEeqw%-t+C=_5I%2t$EcoZQYzr*uc`?`^~m{eI^19-~oEr@j~DQ{+7DNBpAPDwF6x~w>50zhLFn8@qcIvIl&$XSuMX?6F6*=I>NE_%H#lzYgrAi|d4(-QYa|+noU|knGE@?9I;X&+hEe4(-z}?bS~0*KY0Ej_up7 z?cL7p-|p?<4({VF?&VJI=Wg!lPVLPO-YqcR!@kQb0Ppio?-=!N@Ar=H`L6H#&hP#1 z@Ba?)0Wa_aPw*H8Z}10?@CmQ*3(xQk@9+-~@ewca6HoCN6>squkMS9=@f*+a9q;iU z5Aq=|@*_|37A0@;Cy(+euktI;@-6T3FAwuEFY_}OPxCcz^EZ$4Ij{3O&+|R+^FI&t zK`-6>kssaWD6CPxp0i_jix?d9U|-&-WF5@ArQX_<=9@ agHQN{Z}^9g_=&Ih2#e47jsNz60029}9HWithout CUDA GraphsWith CUDA GraphsLaunch 1Kernel 1Launch 2Kernel 2Launch 3Kernel 3Launch Graph 1Kernel 1Kernel 2Kernel 3 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/graphs_1.png b/docs/examples/te_gemma/media/graphs_1.png new file mode 100755 index 0000000000000000000000000000000000000000..f42b50fe0d7804e638f5e719f90cd381cc565fcb GIT binary patch literal 16100 zcmeHubyQUE*Y6Mtf)XMjEsZotmnaRAO2d$XG)UJVARQ7Sjg)}I(A^yhQbP|V-NMi_ za}VF|@4d0!d)IyMT6evF-1|L?#mqT#&hzZOpU<=7v-XM5P*Zq-M~MdlfgUI+%4&f? 
zn1;ajIo!Lz|M*i_ZY6Y@(baJrbdT-%s zW##zZ#>wpvt6d5Nq6aC-%Dnc<*kAN^d-Mh_eB`9-X}$Ea>){J}`>Ln7jPz^_ytcnI zU3|fEZVT>23jJ1D{rivm*P`Qs#hyOpjr=OGR#LQ#nJ2;>$k5Hgpn_|tIP>rY?7*$5 zWCxj7tX7x+|K8kGl76_Ek&#$_@cnABsr~@z{>J_DwyAdFeFoqd|5$82nx$G74jgXg zKFy7(gJi_S#PMpJ2n3=s_5*O4%}n3b)zzE^3%IV?Rb`XulCW^Rw?>)BzGvEGfKs7a zJ>lHU0K2tHo^r=Q5qAxxK51HqS|+Rjrfr zhLXmuSXoQ>$YWX}c@%CvKjrX|=YRk%1o%f@DkN%IxtuQckjzWAE4B?C}(R zv;tGfo#Y_aY{4&(ep~g~)I9TdZ@R3?MtJyuSC?9yh#sI7Gi~f)d^e+^=ED+3wFP}P zYI|u}S=1eYHi21&@#mQ#6q4-6q&)G&Ov=8hNv;~d?^O~6-Wn$Ucz}!&TJ(t%?q_cp25&pb!vsmybm4Ex5Cdm#b)eanEu>a z^PSVKwJa7NhKTq4KEU$Azu2KMBhD{k`gFx}F7iZYSHG%05|^i&%$<_PHi#RmBe{NM zG2Wa>QAVS6nhJMx+<>dg%F5=GbZ5%nZ0nJLOl8l1P$)dqk1*hljHW^*5`esX?W4Y3 zKFS$Uk1r@E{0Q@TvoSFz6&;IQ-DV%#kxcnR!9Cm5%!>u8VCi#_HONhlGT&WJc?)_z z*cVNq@B*|O%tjF*M0;18wwu{dR`!^CAzvjG^5Ee$WwtDpTZErkm7FDLg`m7}c$wOX z;SEnIIQODYe1HV7>UQ|wP>M~l2hRSZrx*0Lx45Ly-`U-@_&3b`{{6d0>@ILQH^i#J zno%ub(OPgiMb*Z^z8lU%(4{ktcdg_b&3M8iC`|h9a*%?zj3+@Lv0|g~Rn|E_PL25p zlm+w)YP5sknAfcAdvqf(W!|{x|Cur6;$?(kYnDdw8lI4vd$Y*ypZ71*z#ND_t!qcI zg~bd7-FAl_yH4JneRMm1ZBkHNci-YoU7mhji4Us;9#Y_A?5ElAeMT(?O}NdO^vW5Q zt?|At-XT)bWNYf+p;ZQ90g{q?R;xy+)@*m2#(7pQMHTbMTf5Sie@e>6KEkeq>_z_w z$p54zsBYYSRifKCxvS7r_dN#TF>Mg5v8DrM6=f7#Kb_*v{i#bW9}%afK6U_MSR`~t z-+NWspRRF-GW+Ipe+8GtA^B$_n!wpK>RpW>77!5)_a1T;giRMCKkDiYh3u?jJ>%-e zoi|nyt?Ais&C1ceKS^dE?pGUSb3z)Xxtcej&3Torc!sqAd0`Q9%<>x5#|l~Jv#+v( z+I>cVw&SsFb{(X8{uRc7bPnH;7s=Ct)|-lL3OrZYq_e|29T4yP>Grcb z_KKr>WV^i2-WA(Dm)DX1awwN)#O3}VrQI1P^g+d1mG<-k_ORl7Ye6~2A&+!m4$@)Z zfpghQ5tsT&L7Oyn{5-cl)w+Rj^I5cPF5NPUgpx5vILEfwu0wZ;w)fXKnR?YaR07z9 zpyw5`_2gp!iF!T=+`5c{T7 z=U8GSUA?!hWGX_{Pyn>F&gV89K3R`#olXNh7ha`5`b4sXs_p9N@$19I@Py9I)y)TV zxxR;H6rw0wYECpQRl7;__Vmq@J@>R$p$1&jFWT}h4w=`jY<|}+tn8GVQxxFY(#=mv z7)!3Y6{7gvahClcKMc|cdP7P@pTY%*LV2Ge;E4_jNo;B9 zsjxpLNo1@Ro#`jw;c$8XfqU2|!(Zg&CYKZjly>p)#({t_Kph?N6B?~fB+6lG@;_G= zQ}jIOe!S(l%H$*7)jN@pB>z{l|PK|p>Ds6fWdz0cbMZ$(Y248H4S@EV~Kl+>%12#IcOTiY9 z@>vsZScU1D)qXSIx8_;an;IS1hT)V8iDOsGSyYkwiLO4G)%x2kW1><@Nl#K@{F^b9 zIpNpGV?)_T-8*-P|E)PC7p4p4X?3MF{3$?P130 z(lWL_+<;E5&7Wm!!AG7S{3KWm$yE4jvYy=2k%&7Py0EORDb#99w%f91#0#1|(e-sB zwEgb6n$`@LoqaHB257>YstWuK9Kz`YoP25WSWsZU>um|uG#BaeR_lARx>N*oGm(2O zCN}R%<+PE+DIjWi`(pxMM(ob}3U(w~TKtkrf!H#EOif~?;*-)KBGsTx!Y#eWZ5#6~ z8x9%%D0?IKeuh&v+7Cm24h5m$%g|bj{(IEr-ye84H&d;97cH#C=D3dfoYD#1VYMSL zoxeJFp4}toJvo~m8`le7G}M(L`oY_YEyJ64)BgroA0yvU!xXP$qlOFuT$ZwRT&Y>S z!nRzwMQ-z6ae_myNhtbeRi{)BWz6}qU?RE-Jd_aPB!vR2qNchVc49vc3_R_LjTtuM zPc;i#zRd8YCUUOY01efbcahe&MLw?Hz;ITNdn^5GD!?iPY0RPWD1ez_bghM7N4CrHu%TK2ACpp-XZEPV;W!^#^NN0on&5JfNiYcH{So=;^F01z{X3w*y&Llx2iEwc zjM%|1k|7Yt#r*{B@%Z>fEHdf#K;CWVwdaSH!(S3)NGh!M=WV`A2aV?>#($pIKDkPg zKRO1s)Hw8UWAB|OJbCDf!HbpQj})$F1tNZ?9DT2v@x_Jxqk%_1Wu)B$QSk3PylHU} z4UKL5A+R*|a{d+~Ks?~D!jDg0L4OJ9ObRydHh=X2zGNTKq@`O;x1GB9$xs6mL?Lkb zw!x^ZNK$KTeKU?`E>;R_RK_*3*=gu-Oxm`Re{iw4ANDe>z39~5$Yq$L;rEp_pl@hPP#Y*PX_B)>)Bpsk;*?3YGQR7j zLiCx>((Um#qf0jIYSNM=$iuLwLN+GZu51(LQ>f}WM@gKVDOeyBki;1uu9F z#h+>EC(}o8Pi3&+zOek`^*$t-YU(}f*=&i0v4ock z4OC(KMqOjTxncxm%BIZ;{8D^+R~wycYt09?`0rEjk%kf4yF>E^{V9LJwFvc-m)&NT z?ePz6f(6-II;)mGom8NrrX}V}ik8@@7mzg6XDkC%A&b%hv2bL+TS}qAF)uZzXSR>) zOuA{kd%#TNmTT_R2CCTI_fqXeazVu^;;o$&#sGhYgNF!>&_KdgalP{!oo44fz3VEE zq-@)}5y{*Aob;xW$cu?g#L&me$)Cas=;_Vc58zeSGC72+4 z{cF#gi$&@9w5+JUE5=Vu}*aOvw@t#?Jq-5KkU*Xju3Lq3;O3#hlXhbKP8p6d1do)mWUnzlJiGfK8({ffI zn^WbJdaJdQXauR#h@8Bdr;*osp5o7eV5SHPK9p_gG@G-Cb}fTIYj_WcSIr}VY} z{Kk*|@tNRgx@jbdcKMAHC*;J$$bULLGvFh`r4pqd7i8~F{qnqe*30zn(8(J64A1Rg z8(Wdk&xaDxB?mWDZI>7N(G;G7v9t?!I}<_#`oSd5R0hweQ;%YyZUdJyZ7VNNcf|1$ zm>2+)FtQp&sV7T;G~IHF4)sMW+#WM2h1@@{qjTGk)4QI<*Q@m620}C7-k&G*5)4af 
zv=Mw>-{m__xURVQ>FY8@gMWM&m(^JmF`cJyeDujer~d&6?Q`w|2!&O|2?nsBq4x}X zUYPU5NRKKuKmQqsRT5e@_zr~W2}jMz@mebQirZD+fq?1V{n}PF)ThS-!e6XAW*?lD z?)9_CbmUiEA5STacC5S6w_k%Ye2-fz^b*JNyhKn-H|fCoVcBWnS?a%VlmRCZNYZhT zjmo(ikmYM*ePfDfI=ch!T5OMDp_Gn^*mRN7@NRjhRwu-5Fa0rLu0PbQoqBC^VlF(l z8u{^_i1l1%COkV}Upou^zKPVLZ_hjHawOF!Y_l~#>AsQs^pY6Yd|FZ@yROunS);fx zBq%dodYjjwM^u3CtjkamT4;jA1}*P2FYQrtfRqq(){QN0yMnl17LM98zkyxm?SLr9 zopC7w@~UzL4Z2C$u4)e>7Mc5sf?4=4-?n$)| z?+Z9nU)`t10X1Be3xhgNw2cl&-H|iP9WGcM(Nt)_uLX>J&U&{P6?Ug~5z~c`U_aqC z)c<&zI|%Sj2Z*N^bW%lYK!WPy(_%8Ca@?CX372X;9}UVC&=A_?p^@TNm&6d@FS>DGAc2K^K=hD4ikt;;zXA|0zZ__-{Lx!o# zpjWiTUJ)|Wjuj}kwPX>{#<(_i=_!=f`RdB*8XjfGm@n)w!5;oz@o;a}uy-QXV4MG- zidINyN&L24uey2R*SliZG{zZFNR9NVf05$%R&V&7t?m1z8osHS(YM!4GS9)q_XZ|B zI4JVGti}lL?^q~A?hH4c*M5-SfxIB)xmK_E{i*H1mSkzIrJ0=It>1KpeL?!!FFuK*TsSUTi$c7MuSYlHAbrFGl^8*w5*<2OC?+{&TH=@nIBJ3D%Xw#{KidKP zZnFT%$4dq&lf|G;f3;A*i9X}zollv}SaK*jLDzG*rO~QH;NFwg7X|^r_=?GL^n;6X zqqEtehV~!k!^Rr(@6YTbj?O*t~-u|mx`ai)5H(K zAHf7si;hRvyaqL<8V8^(-#@KVoAgTfzId%_V<#G#bZQ3rjw~C4h(CQ8H>ksA zO;*D@4%9w}E->$>vww7He>OW;1gjK%pO?)S+f)3hB9Hu#+~5mKViJp=7srqJyuXB7 z3d&IPwAwP}wnF}VQ%EWEsomf;g;jIOFmd>8g%$Gs=h(4#jDtg`WcygNz**ZQzvw_# zLpRTR{tvx4^XibyZLttdBz|+2Iw@E;IH)%2!pOmx)i66?HauXlL_#5C66*iXef?|B zN&~C6kEa|LV2v};Y}ZkykNkQZd?{%GlowesTp*h>fNZ=QF)Bwb#E9Fe-};&lDR==ZvVx*XK1gptCos$m{kUGV8;b zP;uk!NC0{h2)_i5lSZcUtPYgx`k* z{UCEw0pRA!vgQ?D&zp$Lv-j9KimfmK31X-^c8v9HbXdn4`+`x#-Dt5a^ZEp`*26cJOB73$B#u~o>1zy zlO;x(Feo+LEwt`N&&d|OJFa=ml%=@0uZ7tLrL4R#?$Y={A+R*Ob-2^p7kvCCRd zQe|vzGPF#^v%`wqjQ<|Z!LF68LkxEftln%mKAX?>%sIvXCg}~-wRk$t&EXBz?tXbK zbxt23maDwm&~}NjY&ujV`XOCrR_-ENw&|8PNz;oH&cS;ob;%KDX4nVkdHO^QR6R8l zH+J-Iyi(w3_NQHa?n8unt6$(ZD+;q_JNB^ThSq6;)5xub)Op?J@H$f`m*M`glP4+DXCX5~nzGn|D#HxBk1E#SlcV$1v^8 zF+6RtJLd@k$szkqPra*6`R_T;q*pkugMqRbOGOKh<>n8qsH+4Ve)F`doq&tP<(?m!{a_`)%JrpMWx*f#Sd{yS6nQgkz}HaFWO&LH&b^m@;;-S z7c`Rh=KT2gdbKgs(m7qY5?7r>@_J4766>4rK6F68vvKaQz4PhdaBO7-NE_jPPtT0Y z)GX)HFfO>`at!73+n7b>_hydPUnKqXA=uVmJemp{{pxclpLAb}T-xb^XwjFjd*E1i z&Ghno<^o}^+P*<(2A$Ze?UVyGXYfbb?JgY3t<1;G<|4lc8mw`oBd8O!#OTHZ0A?b+ zEG^lNz)T@J%<1!vhgpcoBZg=ay3AWa@q<7zyb1vAF%9lrLp?(~iSw4WOD?B=bnZcj zIC;cXFU`rG7MDs1a={b1^!u)O$iIF!$HK0@!`KfF#^~nh8ETgfirzaYl5E1s9ge6> zbm}YT%Ff9-AacdOTJ7SXd#Pf<1ys?WGIg;*6e6ya-_IRhZ2uZ85^bR(89oS}wU=ub zU@-^CJ^B1UtvpGOF!W0Iq5fR$XNsb*GQWGdGojIYFESvr-HHbc-uFV8e^{y{Gq;@Y z{7m7qqaEizaVc8G*@HLXX+wQ5*QHVT(#VSYEQx`+WtiUF?pO_ntb+pXxjAw^0$HvuP@3r?m*@hLe1#?bUMI- zh~8?#l6VjJyIj1s*l>VaO!lCooTM|ulmrg?YtK19Y}t0NU>U||PUCaM)B=R+LW zyFz4x7a~cA+oP;uY zWrLrmxHY(Z^{iS~8T@*0rs~Y$*%X}&08!x4VsC$C#*2#++xy$o(F?0Vp!aSt$c*I; zuuJcYUTaZHRgkWITl|=rE+r2=mMGk_?^DuTAO>ODEB2>+QUZt>b?Zd zAp1cZd%FU+H}dILO4hBP|K9(|-S#qLqHoi1iHC;L}F7T@gc$2c+2;szWv|7oZHLmybF6kG4V^??O(4p$Nc-L~Or9;f{$ z3-vGU4x|rF^ad8BZ>AwOYy&&FuNwg zflIPt=2ddHKQXcZZFm3vbhV5QP(9qOhtW`WGNNV&;(WREs*)f$e3j@rZsb3NngO@V zCS~~5ZHuQOYjQEOM8jVblB;NHYL;5i1FNj8mt3YU^>~+JRas{=OYs2q;NwJLcUtP> zUFV=xdwbcRJ5x4g#WHL0jFkkePouTeFYRD%X=l}Jx&6NfHB+joF?RyO#-Q1Q3p$IZ`Q{DjcRM>xzqOc9^@fa?7?Bz`cAxdlyoLr5*oy3D;+^8_ z^Ci!;v%GF%x2i`6H?ldrt`Oo86m^uVTs4}{wY9fdaVoFZwpO0EbmC=u0FoXT(^eNM zKgv-a?k*BA6>!NMiI1qB8KS%V%R1z@KD>!i7jebnB%WKX8Lp(Y_^&%5<2e*(PVMtM(^++AG#i>t} zXMB+Yi=kN&v^>@w7h_6Sw_7a0ue3Y7SPw>$$Q8+Y{0nR|n{Aw`p0YZ@EJpf9u09|nI1T%F%so{ZHyfsI@TnTd_3=W6ZvoZu2IM_u?^6NPcsKE}p0 zj_pW~1-FS;DSQ@pO(013N2H`?ebJ9;HnIh*3G9WaN_99?Sx7<&Xd>qwn;)>VvlGV% z=p>0{0lyx!qXSNNr`4wsWyZ~9_~hI@Dm<+rrKOyp4&f!g;0IJf%bhrMlkSW17L|JD zV$zyLE!X&A-D5cqMzW>BU)GpU54P4ysg4e|F zSnlR>6asTV`kc@X`+wFZq}~nAQ>kJe=Es)l7|9erMofb(u1>c>pz7KCQN0f*>)rK} zB}vJ{XYJ*cmGMx>L&#d)yyJ-~dL!NGQ%1&p;I@(}JQ*Shv@gIcdi@z>)if}`Q%c~4 
zp!`|@Q5r5c(qFRGy0EVcrmD&~LCP%kLuc(nf4=Qr?TtF_z#;@Wvpxu-$8J5)*jXCz zvW!}d@q`@DSj6Sj)Vw}-f9fBcwQsE7HfXt1la-M0@MeV&626Q4DeeJYkA4=5=&aT& z(|IN+xU&8QEN^AS3|dY$bnMi(&iqK_*nq*)dioF#5ASH!I@|2}Z2w~MFd$4Wip=a} z<0sIal_F?I#-f@M=n416nXhwr`Rdh!gT_tTl%H>n+{Wa*HZwhW&APCEZ)|wXfW$oy zdTy5<1`9Q5^)WkGXdD|~6S2>Siol3K!x=wx%WAFy5AU?QspFx9}wNZ*fCf#W#H z4VJsR`^7pBBtpOXZKrvq>5mjXz4ib}$L4(@K=y&EXlUCN-dca$dFh0`kf%OI5WSlX zMIy_qE*v^PqG|Cg;R5O%*QagBGsBkagXT|6_x!I;JA(mL8O5zM}LNS7}?qH0rMzmY7)!G(Z;m;o|^+|i(Kzj zi`!o(vZ}9MZf5t73f zZ{BRp*YnW&?(ucgT>_28$9-VPlSqcQBN-8KQdhX3qcH`#m6=M5hP7C64HJ{tb`$~% z48X?8+0#uZ9UUEgztp6aayXeQm~%LV*9_CK7K|GsnNgZ~@-hke-2GB{N8E`}SJ%?G zMLiy1EnmNWZQ2!bPp`rh19jbwrivotdg1EI&!V0c?Y;}^v=_4I4tZ}ckkY8b*0T1Lo@{+=92$8&G+x$D{QBpw)`35 z@Y=7PuJFJ1*qf`x=g=*e1xADW>~j6c6ekBHmDwL&*&F?EC1cezPS^_E-he!tKSo_2 zcH0YGQ(s4qqc5nYlcZx2bA4BelX*bvg2ag`r+yVSU}6zVeR+R>Ns#HzWby4R8ag_m zSU3cxr(0uQJH>_0%N-cE2G9C9IXMXe-F-kg$zHTu-b?^^RpGYtCa{K_+#T2V@A@ctL^JtSRr_z02zEY;0__vA4$t-7dM$cERno{HB^3 z^rl2-Wi^VYZ{ufr@Ep<56H6H)UcU?H1?TPuGLBsrZ~0SsT~=j(qze-!Fs*caz@p$Y z$L4OkFu#RU#r*ySpr*^H7DtxZL6{9Hy z@tj%_WOoUuw^G~QO|VFu{(1=t>lI)J2J@xzv2Muk|@KfedJW~*6~xeP;ZPXVlhKf&$Suo6^Ry{>P_}$YK$dRJ&ZDqfrQo z1W=L?a)FhWo-Vn`gF{Fgot#YOhq~OP5O&P4UPE0?xA%E(W|A-9agr?nYv8q!YAr#f z!(Q|v*$Oc1fa}(|?wbu_IpUXL1adE5VtMS(easa1T+Ioi?dqe4lwV$~#pOZJs3%Zq zIJ-jrZSZs3g*||_o>784X=12^39=z5Q6M4%I0mw)4+o3Qo=u7#fKQbA0@*FAUmX!Y z3}I5rdN+i?#?wz;acyKU;%C77(gZ$*$4w0l4e`GHL1DkpK&&otjP2Np&;cDV*^gw1 z(nvK2Abq;Qn7=-2^hq(U>d`ePYoWG16N|uw*wG zdMhd2OG`_Wi17j9AV&KuOH0e;Sn-37emV%y4d!!d<9D@P2wGV3i=~r>uMFrGsbz>@ z=qH=i&zn`x(nm7H`4DeEC&JGE_HA`*EDs1sKyW;5Da;_?nfECxivWNf5Okw~^#j~P&sUjXKql_S-w$8w z>aLy5JB2YRHSARl(+|6bm6AAH=Oi$Bn~Y{jQteJdfu8_>7bo!t*CgQ53Yhj08QE%m zJCaLQbMIL;=WrI9S!w5wq*8(eT0NEjj#Jwu%Qy9G_&e_^&b31bS{%^hU={>~zZz>3 zt47cf%+M((KOg%xAWNf;2_>$!@=|~_j*Q2I+LN5i*9pp+<<)f(!!IsQ1(>?K;ETy$ zIiI(;Hyby52eNCJ3(9yxO3kSOjZ+)-(?#6j!1iKVqQBQx_I z;Bg>)du`=}9raN;Ed!ZX@02b$#$mp$)3$aMh?)<@Y(!!hf&NX95UQNZL`uI{U{NYOLkW2TkE+&opoSx%J~4 z8MtA#-$P-zSKc>o&qH_&vr6*vc42m`^XoCvVIQ%R$yysY%Kgan*Qn5aPoRWpT1;qp(gj>R0g@KT3gA%8 z7sH+`IP^QHk0lp5aBB`1;1>E(?;P_>+tUkssL@&T zPNc^2o~)*}$%bexrzg@G-@8vrUwe`fgaV_5C3DL`Z~Pt+(h=OJgKrA}dFrkHkxngq%Xe`F zvrJ)7W`aNJcFhrPw}UDrq0Zd4!3g)~IJc6!2UNaF}TbPj3hn&WAg zbur6X0>1((Gr}PHuO0~7)0Wl9@II0sSG7GdsCN#+@s2#Qk0c|;m$JB#8eifHBb*^+ zza&P&h`z+{aJ5*fVEwB?J$ipzwYn3j9}@tQOw-v?|IUVakZhm5kR3~8L+)>n6Uy27 z>23**gi)$L+u_<7)bV#0EM4ozLCz}nlMG$Gj0nD&m~&)UGzn+(d05acX?&Gq6MG0h z!A`(kQ}CL$fA?sR^_^V=Ynr+_YrXT_gWV{$0Olc7I@|hDzYSnH;=Z^dpvioaze_(^ zQwX)ql;JRP#SjUOs6w+Si{9|fUC>5YQ5L`ffW2nw8hIg`qmqOa)*M=i8|yn)2P&bhsy`&GYR{9cX7)V*cO0G@es$$z472)8!6s75 zk$aJhd+p4SB%}HMh@N17zHt6eOih_NjY@|lhood{w;kZ9AqEk-xSEe{D@HRQEdqQM zVTJuHzx`qp>8R9o=_bfd8YR|P-TSCZ#fb2dSJd#t5@7|9pS9j@hf zJvG{dMXyCNM=Er`b6UV#yg`AtIT4&n$X-uz_t(R?bBXxy519DYzVcPAMDYQC>gU%{e`FTmG#yx~mv_`22U>8-{2)j6^O(e3_!IJZCa#oVZ*|R1aX=zT{o^ z6D8f3Sb;r%*$o~o)uv#cDqmdx$qBhEhHzk}j^}Ji_KL1LCea-FfR%-|!>~Nv$ER=c z=mr*zP+wnvpGPeN^hxB}DL1J*{c2=So@$9Sf;RwLwa#*ihq(KJL}3xzmeub^q<3yq zi;hF8P#UZB^{dHbqi7skTZzHsf1&S6bLlz1=GFW}0bS(A#et+c{ojKawA@vvzZp{e zum0yI;9$8kSdpC#x1mL2?VblRr%%Qa$aeR6fZ@=#e{Y3+I>g65U^lSXN?TUXiUBWo zT$KN^%ODiYvY9mO05f{Lc|4R&c4y+-%CF9;3~3;-Gb&y&qpe{WX?t^wi8J$-Oi3_pGsMZDz?97s|-?_6T;41{UGmJ0fwh|v;P(cFMl4G&7_5V6Jn^H0Z zU<29xuw2PDj#Jvr;2734#M>(kf6d#stcm5K3!0k5V0=WtcV6 l`)04~-2Z)^bqtNu*;cmmgMFF|cugIoB&Q}@{=ziqzX0Ag*t-A# literal 0 HcmV?d00001 diff --git a/docs/examples/te_gemma/media/graphs_2.png b/docs/examples/te_gemma/media/graphs_2.png new file mode 100755 index 0000000000000000000000000000000000000000..35c34ede5559bd0c26ce807789ee6d3fdb2bb062 GIT binary patch 
literal 15177
[... base85-encoded PNG data omitted ...]
zC7xuuL>S81;lATh6$Y1~`?(ySeJI9#FBfItZQ8k`@C}k%OF;yz3kWg}-Ja>budwPwBqK7fx>WE%#3GA1@!bmNx7;mxCkRG&d1YNGEChVgVKKdzN&jA^{G%zv22*{r57?(@#$1ggdW;pA7HQltGir_2~++~WFR;J4 zqHC?=2g;+qDghFS4A?SC1r}_mzq<)|c2_9Uwesz`uN{h4NA}nw2isigx=sS3+*YQ? zDE>&994-oAA7T{;9$`+0gT8mERt{C#ew(WlQ&q|{wIFT z#W&FTW`EA2cib!V?i=qNAvI*$C+m4=D?3E)ug5K4k3F;A+8zF4KE!dqp9)N%W?t%e zI=gj-l&NS{xre^4sBL;WGnXFp{L?G4Ge;%R_6FB3qOZZXk+RP{620+v=$3Lbl9Q-j z{Tbou9Q@j4n*v@#N2I$TFUR7{l#AOcvy0UNT@+&xW|OMP#`!XGem{Rs_wdOErmjDSe99N=@5UbTk- z<#hCJl`^AMfdJ_xe9vkCz*AV@;QW9*Gf1RvojWX*1YhN79~s@^#OI5ZBy#*HM4XdC zTe@d#c%yOtpD;f>QOkr85bH|Ir!%A0%4b^n)iK6_Q(RnVYmI@v!P&0NX`4t`Qor1lcIV%X|^t42*4vhl&_JzRmk|{RJA^N63pzRW{v5%Bj zwv-(#(!kxQ=bkRt^pC>U8gHA%=N`n0smPVT9SsWBD(#&?A5oEpW@dKI&d&MG-4QRU zO<6SFwDl{q=ZYOf14t?PIn-)MLMUqzOYD!jXAr1)VGU-L+iMp6<4^w^)Ar144In94 z#si{FK0I1qypK%Wa`w*xnY@Z+`Wgn-K)=DdYvNMyYKSmy}@2%F0+%ZQ8Cei+dok)QG4B$Eode+{ibl z`Uz=~J4z4@wo6l}|LpFe?wZ{EvBoP6gkFN5!-rWBn0$9VUpFF_!?`Zqo24nGPZ3jO z1&)2Z48VYiy>3;{MNiJv?_Vo2xpa0{%>F?KqCM12p;C#riTS!V| zl*u_YLm7Upis*dD(l0@scMvI@F651X)S7|tqwogSmxkYr)}K zd4G1R-^Wwdt(|9i?J?_tv&P`gwYtd&qv=QX`QqQQYYktfu_qDi9%Pm}iuu=XWmDs& zZ3**63W?eXtCF8@hClenCeQuTy#CtNN8)yljwrJyX`HS%vlHK=x}z6(cDlkn;My@7 zQ$^-BHpNsb6)*{Obwx6lp5FkgbHuAOkt#qM{F8iqBm+;)fr`0>1t@2Q9_?~WY%#l* zvA>|nU2798Bl((BeopV3tHt5m!qS?5A>xR$nnU@OX+>Sq_)@E#jnXy=6#w^T;9uu_ zeSAKgEWF~lVq;^YY?3x%a4Wevzr`?h7-*7=YIrhATwQS=iLaOm0z~Jg$*BFGHinkA zwg_J+)IidUWhQab`~D_Rk6B>Gz(U%{YF;e>gbMprO3JZ*sCD!}62HLk=xDEMe^)l4 zyj<4H#6NnVXlr+`tK`n}(;b`j6O^EaD;RrzryZ}|<@S6Z+BU5QIz_tR|65#%U*8=u z3oX}C*WkkA$|o+daTrXRo6xX~DR=k-Em9K<2A8#6iO-1i_s911_QIFTN>aCfTTbJ| zEEwN0GD>K6QCyFDvC-Wyt}a8~Qc0cFQs&N1d_(n>mp*DW@|O(0{(U+sLmMC|cQeA) z=llPIe}3)z2fbtO9<>F&sz3JM4)gwBIN{hnKUs{Ia{7`{4bX-|hWe&=aJL^k`@aD7 CQjcT+ literal 0 HcmV?d00001 diff --git a/docs/examples/te_gemma/media/plot.svg b/docs/examples/te_gemma/media/plot.svg new file mode 100755 index 0000000000..481f156df6 --- /dev/null +++ b/docs/examples/te_gemma/media/plot.svg @@ -0,0 +1 @@ +87.68 s54.11 s28.22 s16.75 s12.13 s0 s10 s20 s30 s40 s50 s60 s70 s80 s90 s100 sHF (baseline)TE (subsitution ofGemmaDecoderLayer withte.TransformerLayer)TE + THD attentionTE + THD attention + CUDA GraphsTE + THD attention + FP8 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/thd_bshd.svg b/docs/examples/te_gemma/media/thd_bshd.svg new file mode 100755 index 0000000000..47eed69565 --- /dev/null +++ b/docs/examples/te_gemma/media/thd_bshd.svg @@ -0,0 +1 @@ +BSHD LayoutQKVQKVCumulative sequence lengths:3, 3 + 1, 3 + 1 + 3, 3 + 1 + 3 + 1Sequence offsets:0, 4, 8, 12[batch_size,seq_len,head_nr,dim][total_nr_tokens,head_nr,dim]Seq. 1Seq. 2Seq. 4Seq. 3sbtTHD LayoutPad. 1Pad. 2Pad. 4Pad. 
diff --git a/docs/examples/te_gemma/requirements.txt b/docs/examples/te_gemma/requirements.txt
new file mode 100755
index 0000000000..c90fb6dad0
--- /dev/null
+++ b/docs/examples/te_gemma/requirements.txt
@@ -0,0 +1,4 @@
+transformers==4.41.1
+accelerate==0.30.1
+datasets==2.19.1
+sentencepiece==0.2.0
\ No newline at end of file
diff --git a/docs/examples/te_gemma/run_gemma_2b.py b/docs/examples/te_gemma/run_gemma_2b.py
new file mode 100644
index 0000000000..db2fb087c9
--- /dev/null
+++ b/docs/examples/te_gemma/run_gemma_2b.py
@@ -0,0 +1,15 @@
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from huggingface_hub import login
+
+access_token = ""  # <== Add your Hugging Face access token here
+login(access_token)
+
+model_name = "google/gemma-3-4b-it"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+print(model.config)
+input_text = "Write me a poem about Machine Learning."
+input_ids = tokenizer(input_text, return_tensors="pt")
+
+outputs = model.generate(**input_ids)
+print(tokenizer.decode(outputs[0]))
diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py
new file mode 100755
index 0000000000..eb781f11cf
--- /dev/null
+++ b/docs/examples/te_gemma/run_generation.py
@@ -0,0 +1,22 @@
+from utils import *
+
+hyperparams.model_name = "/perfhome/repos/ckpt/models/gemma-7b-hf/"  # <== Add model weight location here, e.g. "/path/to/downloaded/gemma/weights"
+hyperparams.qkv_format = "thd"
+
+# Enable CUDA Graphs for the generation phase.
+hyperparams.generation_cuda_graphs = True
+
+if hyperparams.generation_cuda_graphs:
+    # It is necessary to preallocate static buffers:
+    # CUDA Graphs require static input tensors for every kernel.
+    # This approach slightly increases memory consumption;
+    # however, the substantial speedup achieved makes it worthwhile.
+    hyperparams.cuda_graphs_static_batch_size = 64
+    hyperparams.cuda_graphs_static_max_seq_len = 1024
+    hyperparams.cuda_graphs_static_max_context_len = 128
+
+hyperparams.is_paged = False
+model = init_te_gemma_model(hyperparams)
+
+print_sample_of_generated_texts(model)
+benchmark_generation(model)
diff --git a/docs/examples/te_gemma/run_generation_llama.py b/docs/examples/te_gemma/run_generation_llama.py
new file mode 100755
index 0000000000..2f90995bd1
--- /dev/null
+++ b/docs/examples/te_gemma/run_generation_llama.py
@@ -0,0 +1,10 @@
+from utils import *
+
+hyperparams.model_name = "/perfhome/repos/ckpt/models/llama2-7b-hf/"  # <== Add model weight location here, e.g. "/path/to/downloaded/llama/weights"
+hyperparams.qkv_format = "thd"
+
+# model = init_te_llama_model(hyperparams)
+model = init_baseline_model(hyperparams)
+
+print_sample_of_generated_texts(model)
+# benchmark_generation(model)
diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py
new file mode 100755
index 0000000000..f24b700979
--- /dev/null
+++ b/docs/examples/te_gemma/te_gemma.py
@@ -0,0 +1,808 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
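+
+# Overview: this module re-implements HuggingFace's Gemma causal LM on top of
+# Transformer Engine. `GemmaDecoderLayer` is monkey-patched with a wrapper around
+# `te.pytorch.TransformerLayer`, generation is split into a context phase and a
+# token-by-token generation phase, and both phases can optionally be captured
+# into CUDA Graphs (see `TEGemmaForCausalLMCudaGraphs`).
+#
+# Illustrative usage sketch (not one of the tutorial scripts; it mirrors
+# run_generation.py and assumes Gemma weights are available locally):
+#
+#     from utils import hyperparams, init_te_gemma_model
+#
+#     hyperparams.model_name = "/path/to/downloaded/gemma/weights"
+#     hyperparams.qkv_format = "thd"
+#     model = init_te_gemma_model(hyperparams)
+#     tokens = model.generate(input_ids.cuda(), max_new_tokens=40)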
+ +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.gemma.modeling_gemma import GemmaForCausalLM, GemmaConfig, GemmaModel + +import torch.nn.functional as F + +def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): + """ + Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which + will be used later. + + (Currently a hack, this should be reformatted to a better method) + """ + + assert lengths_tensor is not None and max_input_length is not None, \ + "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + torch.add( + inference_params.cached_sequence_lengths, + inference_params.input_sequence_lengths, + out=inference_params.cached_sequence_lengths) + # inference_params.input_sequence_lengths[:len(lengths_tensor)].copy_(lengths_tensor, non_blocking=True) + inference_params.input_sequence_lengths.copy_(lengths_tensor) + + inference_params.max_incoming_seq_len = max_input_length + + max_seqlen_q, max_seqlen_kv = inference_params.max_incoming_seq_len, inference_params.max_sequence_length + + # # Allocation of buffers, it works correctly with CUDA Graphs. + _allocator = StaticBufferAllocator() + NR_BUFFERS = 4 + + cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded = [ + _allocator(inference_params.max_batch_size + 1, dtype=torch.int32, device="cuda") + for _ in range(NR_BUFFERS) + ] + + torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) + torch.cumsum( + inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, + dim=0, out=cu_seqlens_kv[1:]) + # If layer has shape [b * s_layer, h, d] + # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] + cu_seqlens_q_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q) + cu_seqlens_kv_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv) + + # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + + # print(inference_params.step_dict) + + def get_cache_params_in_infer_params(): + return max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + + # For the time being, create an ad-hoc field in `inference_params` to get the variables. + # @sudhakars: to create a better way later. 
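+    # Worked example (illustrative, matching media/thd_bshd.svg): for a batch of
+    # 4 sequences with incoming lengths [3, 1, 3, 1] and an empty cache, the
+    # buffers computed above hold
+    #     cu_seqlens_q  = [0, 3, 4, 7, 8]
+    #     cu_seqlens_kv = [0, 3, 4, 7, 8]
+    # while the padded offsets are multiples of max_seqlen_q / max_seqlen_kv.
+    # The accessor attached below hands these buffers to every decoder layer.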
+ inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + +# This class has been modified from +# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py +class GemmaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + return emb.unsqueeze(2) # should return in [b, s, 1, d] format + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthless, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) + +class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `GemmaDecoderLayer` and easier to replace it in the code. + + Args: + config: GemmaConfig + args: positional args (for compatibility with `GemmaDecoderLayer`) + kwargs: keyword args (for compatibility with `GemmaDecoderLayer`) + """ + + def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): + + self.gemma_config = config + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="geglu", + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.gemma_config.head_dim, + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, + ) + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. + """ + return self._allocator(size, dtype, device) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. 
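+        # In summary, this forward pass: (1) builds the rotary position embeddings,
+        # (2) pulls the THD cache metadata prepared by setup_cache_params_from_infer_params(),
+        # (3) strips HF-specific kwargs that TransformerLayer does not accept, and
+        # (4) calls te.pytorch.TransformerLayer.forward() with these extra arguments.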
+
+        # For this tutorial the non-"arbitrary" mask types are used: "padding_causal"
+        # for the context phase and "padding" for the generation phase.
+        # @sudhakars: find a better way to provide the `tensor_format`
+        te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)(
+            max_seq_len=self.gemma_config.max_position_embeddings
+        ).cuda()
+
+        inference_params = kwargs["inference_params"]
+        # @sudhakars: big assumption that the input is "sbhd"
+        if inference_params.qkv_format_legacy == "thd":
+            (
+                max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded
+            ) = inference_params.get_cache_params_from_infer_params()
+
+        # These args cannot be passed to TransformerLayer.
+        keys_to_remove = [
+            "position_ids",
+            "past_key_value",
+            "output_attentions",
+            "use_cache",
+            "cache_position",
+        ]
+        for key in keys_to_remove:
+            kwargs.pop(key, None)
+
+        # We need to return a tuple to be compatible with HF.
+ return ( + super().forward( + *args, + rotary_pos_emb=te_rope_emb, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_kv=cu_seqlens_kv, + max_seqlen_q=max_seqlen_q, + max_seqlen_kv=max_seqlen_kv, + **kwargs + ), + ) + +class StaticGemmaModel(torch.nn.Module): + """ + StaticGemma is based of HF GemmaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: GemmaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.normalizer = torch.tensor(self.model.config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits + + +class GemmaGenerator(torch.nn.Module): + """ + GemmaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: GemmaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.gemma_layers = StaticGemmaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.gemma_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + logits = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + return next_tokens + + +class PartialForwardWrapper(torch.nn.Module): + """ + This class wraps a `torch.nn.Module` while partially modifying its `forward` + + CUDAGraphs' `make_graphed_callables` method takes in a module but if only + `functools.partial` is used to wrap the module, it changes the modules' + type and that interferes with the `make_graphed_callables` intrinsics. + """ + def __init__(self, module, **kwargs): + super().__init__() + self.module = module + self.partial_forward = partial(self.module.forward, **kwargs) + + def __call__(self, *args, **kwargs): + return self.partial_forward(*args, **kwargs) + + # @sudhakars: should we use better abstraction? 
+ def set_inference_params(self, *args, **kwargs): + return self.module.set_inference_params(*args, **kwargs) + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `GemmaDecoderLayer` with custom `TEGemmaDecoderLayer`. + """ + original_gemma_decoder_cls = transformers.models.gemma.modeling_gemma.GemmaDecoderLayer + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = original_gemma_decoder_cls + + +class TEGemmaForCausalLM(GemmaForCausalLM): + """ + Causal LM created with `GemmaModel`. The underlying `GemmaDecoderLayer` + class is monkey-patched with `TEGemmaDecoderLayer` class before + initializing the causal LM with `GemmaForCausalLM`. + + Args: + config: GemmaConfig + """ + + def __init__(self, config: GemmaConfig): + with replace_decoder(te_decoder_cls=TEGemmaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = GemmaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticGemmaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths, max_seq_len=None): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) if max_seq_len is None else max_seq_len + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. fix the max seq len to + # the longest sequence in the batch + actual_max_seq_len = max_seq_len + inputs.data = new_input_ids[:, :actual_max_seq_len] + print(f"actual_max_seq_len: {actual_max_seq_len}") + + # For Paged Attention, make the valid sequences, multiple of 64 + # inputs.data = new_input_ids[:, :4].repeat(1, 16) + + + def _next_64_multiply(self, x): + return ((x + 63) // 64) * 64 + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_hidden_states_buffer(self, input_ids: torch.Tensor): + tensor = torch.empty( + (input_ids.shape[0], input_ids.shape[1], self.hidden_size), + device="cuda", + dtype=torch.float32, + ) + # import pdb; pdb.set_trace() + return tensor + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. 
+ def _create_inference_params(self, *args, **kwargs): + infer_params = InferenceParams( + *args, **kwargs + ) + + max_batch_size = kwargs["max_batch_size"] + + # Initialize some legacy params + infer_params.cached_sequence_lengths = torch.zeros( + (max_batch_size,), device="cuda", dtype=torch.int32) + infer_params.input_sequence_lengths = torch.zeros( + (max_batch_size,), device="cuda", dtype=torch.int32) + + return infer_params + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _get_max_input_seq_len(self, input_ids): + return input_ids.shape[1] \ + if not hasattr(self.config, "cuda_graphs_static_max_context_len") \ + else self.config.cuda_graphs_static_max_context_len + + # The buffer for generation is some part (beginning) of hidden states buffer. + # This function returns pointer to it and also copies there data if provided. + def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): + # hidden_states_buffer has shape [b, s, hd] + # generation_buffer will have shape [b, 1, hd] + # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)" + # will return uncontiguous buffer, which we want to avoid. + output = hidden_states_buffer.view(-1)[ + : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2] + ] + if data_to_copy is not None: + output.copy_(data_to_copy.reshape(-1)) + generation_buffer = output.view( + (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2]) + ) + return generation_buffer + + def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): + # import pdb; pdb.set_trace() + hidden_states = self._create_hidden_states_buffer(input_ids) + hidden_states.data[:] = self.model.embed_tokens(input_ids) + + # We need to update offsets before every forward pass to make cache work properly. + lengths = input_ids.ne(0).sum(dim=1) + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=lengths, max_input_length=input_ids.shape[1] + # ) + lengths = input_ids.ne(0).sum(dim=1) + max_input_length = input_ids.shape[1] + setup_cache_params_from_infer_params(inference_params, lengths, max_input_length) + else: + inference_params.setup_before_new_input(length=input_ids.shape[1]) + + + logits = self._model_context_phase( + hidden_states, + attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary" + ) + + # We choose logits coresponding with last token in each sequence, + # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) + # Tensor when qkv_format == "thd" and + # they are the last token in the sequence when qkv_format != "thd". + if self.config.qkv_format == "thd": + logits = logits[ + + torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, : + ] + else: + logits = logits[:, -1, :] + + next_tokens = torch.argmax(logits, dim=1) + + # self.hidden_states have shape [b, s, hd]. 
+ # We return hidden state for the last token - output has shape [b, 1, hd] + hidden_states = self._get_generation_buffer( + hidden_states, self.model.embed_tokens(next_tokens) + ) + return hidden_states, next_tokens + + def _make_mask_one_token_longer(self, mask): + return torch.cat( + [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1 + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + pad_token_id: int = 0, + max_new_tokens: int = 0, + *args, + **kwargs + ): + self.eval() + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast( + enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None + ): + + lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] + + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) + + # This is not needed since the padding to the left is already done in utils.py + # # Pad input_ids with zeros on the left to match max_input_sequence_len + # # This adds padding tokens (0) to the left side of each sequence in the batch + # # Shape goes from [batch_size, seq_len] to [batch_size, max_input_sequence_len] + # input_ids = F.pad( + # input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 + # ) + + if self.config.qkv_format == "thd": + # For thd layout padding is at the end, otherwise at the beginning. + TEGemmaForCausalLM._padding_to_end(input_ids, + lengths, + max_seq_len=self.config.cuda_graphs_static_max_context_len \ + if self.config.generation_cuda_graphs else None + ) + + # import pdb; pdb.set_trace() + + # InferenceParams is a cache, where keys and values of previous tokens are stored. + # Moreover it stores length of both already generated and input sequences. + inference_params = self._create_inference_params( + max_batch_size=batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + # is_cuda_graph=False + ) + + def init_cache_params_in_infer_params(inference_params): + inference_params.cached_sequence_lengths = torch.zeros( + (batch_size,), device="cuda", dtype=torch.int32) + inference_params.input_sequence_lengths = torch.zeros( + (batch_size,), device="cuda", dtype=torch.int32) + + init_cache_params_in_infer_params(inference_params) + inference_params.qkv_format_legacy = self.config.qkv_format + + self._model_context_phase.set_inference_params(inference_params) + self._model_generation_phase.set_inference_params(inference_params) + + hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + + # Generation phase. 
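+            # From here on, every step consumes a [batch, 1, hidden] buffer that already
+            # holds the embedding of the previously generated token, advances the KV-cache
+            # offsets by one for every sequence, and appends the newly selected (greedy
+            # argmax) token to `output_tokens`.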
+ if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params(inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. + # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + # setup_cache_params_from_infer_params(inference_params, input_ids) + # @sudhakars: could create position_ids from mask here + next_tokens = self._model_generation_phase(hidden_states, mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. + if self.config.qkv_format == "thd": + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params(inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1) + else: + inference_params.setup_before_new_input(length=1) + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="arbitrary" + ) + return logits + +class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): + """ + TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: GemmaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + self.config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. 
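+        # Reusing the first `batch * hidden_size` elements of `hidden_states_buffer`
+        # keeps the generation input at a fixed memory address, which is required for
+        # the captured graph to be replayed with the same pointers.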
+        self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer)
+        self.inference_params = InferenceParams(
+            max_batch_size=self.config.cuda_graphs_static_batch_size,
+            max_sequence_length=self.config.cuda_graphs_static_max_seq_len,
+            num_heads_kv=self.config.num_key_value_heads,
+            head_dim_v=self.config.head_dim,
+            head_dim_k=self.config.head_dim,
+            dtype=torch.bfloat16,
+            is_paged=self.config.is_paged,
+            page_size=64,
+            total_num_pages=64,  # 64 * 64 (max_sequence_length) / 64 (page_size)
+        )
+
+        # Initialize the legacy sequence-length bookkeeping, as in TEGemmaForCausalLM above.
+        max_batch_size = self.config.cuda_graphs_static_batch_size
+        self.inference_params.cached_sequence_lengths = torch.zeros(
+            (max_batch_size,), device="cuda", dtype=torch.int32)
+        self.inference_params.input_sequence_lengths = torch.zeros(
+            (max_batch_size,), device="cuda", dtype=torch.int32)
+
+        self.inference_params.qkv_format_legacy = self.config.qkv_format
+
+        self._model_generation_phase.set_inference_params(self.inference_params)
+        self._model_context_phase.set_inference_params(self.inference_params)
+
+    def record(self):
+        # We record the model in eval mode (training=False), because it will be used for generation.
+        self.eval()
+
+        # Here "the trick" happens. We override methods from TEGemmaForCausalLM
+        # with their recorded versions. After that, every invocation of these methods
+        # replays the captured graph with minimal CPU involvement,
+        # which leads to a large speedup.
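+        # Recording happens twice: once for the context phase on a
+        # [static_batch_size, static_max_context_len, hidden] input, and once for the
+        # generation phase on a [static_batch_size, 1, hidden] input. Later calls must
+        # use exactly these shapes.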
+ input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + self.inference_params.reset() + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + # max_input_length=input_shape[1], + # ) + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda") + max_input_length = input_shape[1] + setup_cache_params_from_infer_params(self.inference_params, lengths, max_input_length) + + self._model_context_phase = self.record_graph( + self._model_context_phase, + self.hidden_states_buffer, + attn_mask_type="padding_causal" + ) # CUDA Graphs recording + + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + self.inference_params.reset() + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + # max_input_length=input_shape[1], + # ) + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda") + max_input_length = input_shape[1] + setup_cache_params_from_infer_params(self.inference_params, lengths, max_input_length) + + self._model_generation_phase = self.record_graph( + self._model_generation_phase, + self.generation_buffer, + attn_mask_type="padding" + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor, **sample_kwargs): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=3, + sample_kwargs=sample_kwargs, + ) + return graphed_function diff --git a/docs/examples/te_gemma/te_gemma_loading_weights.py b/docs/examples/te_gemma/te_gemma_loading_weights.py new file mode 100755 index 0000000000..41f62ad7f3 --- /dev/null +++ b/docs/examples/te_gemma/te_gemma_loading_weights.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import re +import gc +import torch + +from typing import List + +from transformer_engine.pytorch.fp8 import fp8_model_init + +from transformers.modeling_utils import load_state_dict, _load_state_dict_into_model +from transformers.utils.hub import get_checkpoint_shard_files + +""" + This file contains logic of mapping the HuggingFace GemmaModel parameters + with TransformerEngine TransformerLayer. 
When we have initialized Transformer models + both with HF and with TE, we can copy parameters from the first to the second. +""" + + +def _load_weights_for_fp8_model(vanilla_model, hyperparams): + # The weights are loaded from the file with state_dict + # of model with weights which contains also fp8 parameters. + # The weights are in BF16 precision, but they contain fp8 metadata + # computed by the calibration procedure. + vanilla_model.load_state_dict( + torch.load(hyperparams.fp8_model_weights_filename), + strict=False, + # strict = false, because some parameters have + # multiple pointers to the same weight + # vanilla_model._model_context_phase.model + # and vanilla_model._model_generation_phase.model + ) + + +def _load_weights_for_standard_model(vanilla_model, config): + # The weights are loaded from the file with original weights. + archive_file = os.path.join(config.model_name, "model.safetensors.index.json") + resolved_archive_file, _ = get_checkpoint_shard_files(config.model_name, archive_file) + total_dict = {} + for shard_file in resolved_archive_file: + state_dict = load_state_dict(shard_file) + total_dict.update(state_dict) + + replace_params( + total_dict, + vanilla_model.state_dict(), + config, + qkv_fused_and_interleaved=config.fuse_qkv_params, + ) + # Copy parameters like embedding: + _load_state_dict_into_model(vanilla_model, total_dict, start_prefix="") + + # Force mem release. Taken from huggingface code. + del total_dict + gc.collect() + + +def load_te_model(cls, config): + """ + Custom method adapted from `from_pretrained` method in HuggingFace + Transformers repo: + https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579 + """ + config.use_cache = False # To make TransformerLayer compatible with GemmaModel + with fp8_model_init(config.fp8_model_init): + # there we need only to create model + vanilla_model = cls(config).to(torch.bfloat16).cuda() + + # return vanilla_model + # and now we copy the weights into it + if config.fp8_model_weights_filename is not None: + _load_weights_for_fp8_model(vanilla_model, config) + else: + _load_weights_for_standard_model(vanilla_model, config) + + return vanilla_model + + +def _get_all_layer_prefixes_to_update(hf_state_dict): + """ + There are many parameters in hf_state_dict, whose name start with "model.layers.[number]." + This function extracts all strings like "model.layers.[number]." + that are starting strings of keys in hf_state_dict. + """ + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." + m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + return all_layer_prefixes + + +def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): + """ + Replaces params from TE TransformerLayer state_dict with corresponding parameters + from HuggingFace GemmaModel state_dict. 
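+    For example, "model.layers.N.input_layernorm.weight" from the HF checkpoint ends up in
+    "model.layers.N.self_attention.layernorm_qkv.layer_norm_weight" of the TE model, and
+    "mlp.gate_proj.weight" / "mlp.up_proj.weight" are packed into the two halves of
+    "layernorm_mlp.fc1_weight".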
+ """ + all_layer_prefixes: List[str] = _get_all_layer_prefixes_to_update(hf_state_dict) + + for layer_prefix in all_layer_prefixes: + + def copy_from_ht_to_te(te_name, hf_name, start=None, end=None): + te_state_dict[layer_prefix + te_name].data[start:end].copy_( + hf_state_dict[layer_prefix + hf_name] + ) + + copy_from_ht_to_te( + "self_attention.layernorm_qkv.layer_norm_weight", "input_layernorm.weight" + ) + copy_from_ht_to_te("self_attention.proj.weight", "self_attn.o_proj.weight") + copy_from_ht_to_te("layernorm_mlp.layer_norm_weight", "post_attention_layernorm.weight") + copy_from_ht_to_te("layernorm_mlp.fc2_weight", "mlp.down_proj.weight") + copy_from_ht_to_te( + "layernorm_mlp.fc1_weight", "mlp.gate_proj.weight", end=config.intermediate_size + ) + copy_from_ht_to_te( + "layernorm_mlp.fc1_weight", "mlp.up_proj.weight", start=config.intermediate_size + ) + + if qkv_fused_and_interleaved: + """ + When qkv_fused_and_interleaved=True, key, query and value layers are on one tensor + in TE TransformerLayer. Moreover they are interleaved within each head. + Let q_i, k_i and v_i be query, key and value layers for i-th head respectively. + Then TE stores weight tensor in the form: + [q1 k1 v1 q2 k2 v2 ...] + This is done to maximally optimize performance time. + """ + te_qkv_layer = te_state_dict[layer_prefix + "self_attention.layernorm_qkv.weight"] + + def copy_interleave(hf_name, idx): + src = hf_state_dict[layer_prefix + hf_name] + for head_nr in range(config.num_attention_heads): + dst_offset = head_nr * config.head_dim * 3 + dst_slice = slice( + dst_offset + idx * config.head_dim, dst_offset + (idx + 1) * config.head_dim + ) + src_slice = slice( + head_nr * config.head_dim, head_nr * config.head_dim + config.head_dim + ) + te_qkv_layer[dst_slice, :] = src[src_slice, :] + + copy_interleave("self_attn.q_proj.weight", 0) + copy_interleave("self_attn.k_proj.weight", 1) + copy_interleave("self_attn.v_proj.weight", 2) + else: + copy_from_ht_to_te( + "self_attention.layernorm_qkv.query_weight", "self_attn.q_proj.weight" + ) + copy_from_ht_to_te("self_attention.layernorm_qkv.key_weight", "self_attn.k_proj.weight") + copy_from_ht_to_te( + "self_attention.layernorm_qkv.value_weight", "self_attn.v_proj.weight" + ) + + return all_layer_prefixes diff --git a/docs/examples/te_gemma/te_llama.py b/docs/examples/te_gemma/te_llama.py new file mode 100755 index 0000000000..426b79cbf1 --- /dev/null +++ b/docs/examples/te_gemma/te_llama.py @@ -0,0 +1,759 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaConfig, LlamaModel + +import torch.nn.functional as F + +def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): + """ + Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which + will be used later. 
+ + (Currently a hack, this should be reformatted to a better method) + """ + + assert lengths_tensor is not None and max_input_length is not None, \ + "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + torch.add( + inference_params.cached_sequence_lengths, + inference_params.input_sequence_lengths, + out=inference_params.cached_sequence_lengths) + inference_params.input_sequence_lengths.copy_(lengths_tensor) + inference_params.max_incoming_seq_len = max_input_length + + max_seqlen_q, max_seqlen_kv = inference_params.max_incoming_seq_len, inference_params.max_sequence_length + + # # Allocation of buffers, it works correctly with CUDA Graphs. + _allocator = StaticBufferAllocator() + NR_BUFFERS = 4 + + cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded = [ + _allocator(inference_params.max_batch_size + 1, dtype=torch.int32, device="cuda") + for _ in range(NR_BUFFERS) + ] + + torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) + torch.cumsum( + inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, + dim=0, out=cu_seqlens_kv[1:]) + # If layer has shape [b * s_layer, h, d] + # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] + cu_seqlens_q_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q) + cu_seqlens_kv_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv) + + # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + + # print(inference_params.step_dict) + + def get_cache_params_in_infer_params(): + return max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + + # For the time being, create an ad-hoc field in `inference_params` to get the variables. + # @sudhakars: to create a better way later. 
+ inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + +# This class has been modified from +# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py +class LlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + return emb.unsqueeze(2) # should return in [b, s, 1, d] format + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthlessly, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) + +class TELlamaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `LlamaDecoderLayer` and easier to replace it in the code. + + Args: + config: LlamaConfig + args: positional args (for compatibility with `LlamaDecoderLayer`) + kwargs: keyword args (for compatibility with `LlamaDecoderLayer`) + """ + + def __init__(self, config: LlamaConfig, layer_idx: int, *args, **kwargs): + + self.llama_config = config + self.head_dim = self.llama_config.hidden_size // self.llama_config.num_attention_heads + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, # LLaMA specific + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="swiglu", # LLaMA specific + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.head_dim, # LLaMA specific + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, # LLaMA specific + ) + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. 
+ """ + return self._allocator(size, dtype, device) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. + + if "self_attn_mask_type" in kwargs: + attn_mask_type = kwargs['self_attn_mask_type'] + else: + attn_mask_type = "whatever_default_is" + + if attn_mask_type == "arbitrary": + # @sudhakars: following logic doesn't work for `thd` + attn_mask = kwargs['attention_mask'] + attention_mask_inv = ~attn_mask + generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + + if generation_case: + # @sudhakars: for some reason, `attention_mask` for generation is of the + # form [b, 1, 1, s]. + attention_mask_inv = attention_mask_inv.squeeze(1).squeeze(1) + assert torch.tensor(torch.tensor(attention_mask_inv.shape).shape).item() == 2 + + # Create `position_ids` on the fly using `attention_mask` since HF + # does the same in generation logic. + position_ids = attention_mask_inv.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask_inv == 0, 1) + + if "position_ids" in kwargs and kwargs['position_ids'] is not None: + assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" + + # convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for + # context phase and context phase gets [b, s] sized attn mask + seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] + arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() + for sample_idx in range(attn_mask.shape[0]): + pad_len = attn_mask[sample_idx].sum().int().item() + # set the columns to padded + arbitrary_attn_mask[sample_idx, :, :, :pad_len] = True + # set the rows to padded + if not generation_case: + arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True + arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() + + # Update the attention mask to arbitrary + kwargs['attention_mask'] = arbitrary_attn_mask.cuda() + + # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding + # @sudhakars: change the hardcoded `dim` to something like config.head_dim + te_rope_emb = LlamaRotaryEmbedding(dim=self.head_dim, max_position_embeddings=self.llama_config.max_position_embeddings).cuda() + te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) + else: + # When the `attention_mask` is not `arbitrary`, then for the purpose + # of this tutorial, we're using `padding_causal` (for context) and + # `padding` (for generation) + # @sudhakars: find a better way to provide the `tensor_format` + te_rope_emb = RotaryPositionEmbedding(self.head_dim)( # Use self.head_dim + max_seq_len=self.llama_config.max_position_embeddings + ).cuda() + + inference_params = kwargs["inference_params"] + # @sudhakars: big assumption that the input is "sbhd" + # batch_size = args[0].shape[0] + if inference_params.qkv_format_legacy == "thd": + ( + max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + ) = inference_params.get_cache_params_from_infer_params() + + # this args cannot be passed to TransformerLayer + keys_to_remove = [ + "position_ids", + "past_key_value", + "output_attentions", + "use_cache", + "cache_position", + ] + for key in keys_to_remove: + kwargs.pop(key, None) + + # import pdb; pdb.set_trace() + # We need to return tuple to be compatible with HF. 
+ return ( + super().forward( + *args, + rotary_pos_emb=te_rope_emb, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_kv=cu_seqlens_kv, + max_seqlen_q=max_seqlen_q, + max_seqlen_kv=max_seqlen_kv, + **kwargs + ), + ) + +class StaticLlamaModel(torch.nn.Module): + """ + StaticLlama is based of HF LlamaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: LlamaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.llama_config = model.config # Store LlamaConfig + self.normalizer = torch.tensor(self.llama_config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + # import pdb; pdb.set_trace() + if hidden_states.shape[1] > 1: + torch.save(hidden_states, "input_ctxt.pth") + + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits + + +class LlamaGenerator(torch.nn.Module): + """ + LlamaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: LlamaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.llama_layers = StaticLlamaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.llama_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_type: str = "arbitrary"): + logits = self.llama_layers(hidden_states, attention_mask=mask, attn_mask_type = mask_type) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. 
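+        # (Note: unlike GemmaGenerator in te_gemma.py, which relies on the generation loop
+        # to update the cache offsets, this generator advances them inside its own forward.)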
+ if self.qkv_format == "thd": + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params(self.inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1) + else: + self.inference_params.setup_before_new_input(length=1) + + return next_tokens + + +class PartialForwardWrapper(torch.nn.Module): + """ + This class wraps a `torch.nn.Module` while partially modifying its `forward` + + CUDAGraphs' `make_graphed_callables` method takes in a module but if only + `functools.partial` is used to wrap the module, it changes the modules' + type and that interferes with the `make_graphed_callables` intrinsics. + """ + def __init__(self, module, **kwargs): + super().__init__() + self.module = module + self.partial_forward = partial(self.module.forward, **kwargs) + + def __call__(self, *args, **kwargs): + return self.partial_forward(*args, **kwargs) + + # @sudhakars: should we use better abstraction? + def set_inference_params(self, *args, **kwargs): + return self.module.set_inference_params(*args, **kwargs) + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`. + """ + original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer + transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.llama.modeling_llama.LlamaDecoderLayer = original_llama_decoder_cls + + +class TELlamaForCausalLM(LlamaForCausalLM): + """ + Causal LM created with `LlamaModel`. The underlying `LlamaDecoderLayer` + class is monkey-patched with `TELlamaDecoderLayer` class before + initializing the causal LM with `LlamaForCausalLM`. + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + with replace_decoder(te_decoder_cls=TELlamaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = LlamaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticLlamaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. 
fix the max seq len to
+        # the longest sequence in the batch.
+        actual_max_seq_len = inputs.ne(0).sum(dim=1).max()
+        inputs.data = new_input_ids[:, :actual_max_seq_len]
+
+    def _next_64_multiply(self, x):
+        return ((x + 63) // 64) * 64
+
+    # This function is overridden in TELlamaForCausalLMCudaGraphs.
+    def _create_hidden_states_buffer(self, input_ids: torch.Tensor):
+        return torch.empty(
+            (input_ids.shape[0], input_ids.shape[1], self.hidden_size),
+            device="cuda",
+            dtype=torch.float32,
+        )
+
+    # This function is overridden in TELlamaForCausalLMCudaGraphs.
+    def _create_inference_params(self, *args, **kwargs):
+        infer_params = InferenceParams(*args, **kwargs)
+
+        max_batch_size = kwargs["max_batch_size"]
+
+        # Initialize the legacy length-tracking tensors used by the THD code path.
+        infer_params.cached_sequence_lengths = torch.zeros(
+            (max_batch_size,), device="cuda", dtype=torch.int32
+        )
+        infer_params.input_sequence_lengths = torch.zeros(
+            (max_batch_size,), device="cuda", dtype=torch.int32
+        )
+
+        return infer_params
+
+    # This function is overridden in TELlamaForCausalLMCudaGraphs.
+    def _get_max_input_seq_len(self, input_ids):
+        return input_ids.shape[1]
+
+    # The buffer for generation is a part (the beginning) of the hidden states buffer.
+    # This function returns a view of it and also copies data into it if provided.
+    def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None):
+        # hidden_states_buffer has shape [b, s, hd]
+        # generation_buffer will have shape [b, 1, hd]
+        # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)"
+        # would return a non-contiguous buffer, which we want to avoid.
+        output = hidden_states_buffer.view(-1)[
+            : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2]
+        ]
+        if data_to_copy is not None:
+            output.copy_(data_to_copy.reshape(-1))
+        generation_buffer = output.view(
+            (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2])
+        )
+        return generation_buffer
+
+    def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams):
+        hidden_states = self._create_hidden_states_buffer(input_ids)
+        hidden_states.data[:] = self.model.embed_tokens(input_ids)
+
+        # We need to update the cache offsets before every forward pass.
+        lengths = input_ids.ne(0).sum(dim=1)
+        if self.config.qkv_format == "thd":
+            max_input_length = input_ids.shape[1]
+            setup_cache_params_from_infer_params(inference_params, lengths, max_input_length)
+        else:
+            inference_params.setup_before_new_input(length=input_ids.shape[1])
+
+        logits = self._model_context_phase(
+            hidden_states,
+            attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None),
+            attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary",
+        )
+
+        # We pick the logits corresponding to the last token of each sequence. For
+        # qkv_format == "thd" the sequences have various lengths and the last-token
+        # positions are given by (inference_params.input_sequence_lengths - 1);
+        # otherwise the last token is simply the final position of the padded sequence.
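+        # For example, with qkv_format == "thd" and input_sequence_lengths == [3, 5],
+        # the logits picked below are logits[0, 2, :] and logits[1, 4, :].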
+        if self.config.qkv_format == "thd":
+            logits = logits[
+                torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, :
+            ]
+        else:
+            logits = logits[:, -1, :]
+        next_tokens = torch.argmax(logits, dim=1)
+
+        # hidden_states has shape [b, s, hd].
+        # We return the hidden state for the last token - the output has shape [b, 1, hd].
+        hidden_states = self._get_generation_buffer(
+            hidden_states, self.model.embed_tokens(next_tokens)
+        )
+        return hidden_states, next_tokens
+
+    def _make_mask_one_token_longer(self, mask):
+        return torch.cat(
+            [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        pad_token_id: int = 0,
+        max_new_tokens: int = 0,
+        *args,
+        **kwargs
+    ):
+        self.eval()
+
+        # We need both autocasts: FP8 for operations that can run in lower precision
+        # and BF16 for those that cannot.
+        with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast(
+            enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None
+        ):
+
+            lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze()  # [b]
+
+            if self.config.qkv_format == "thd":
+                # For the thd layout padding is at the end, otherwise at the beginning.
+                TELlamaForCausalLM._padding_to_end(input_ids, lengths)
+
+            batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len(
+                input_ids
+            )
+
+            # InferenceParams is a cache in which the keys and values of previous tokens
+            # are stored. It also stores the lengths of both the already generated and
+            # the input sequences.
+            head_dim = self.config.hidden_size // self.config.num_attention_heads
+            inference_params = self._create_inference_params(
+                max_batch_size=batch_size,
+                max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens),
+                num_heads_kv=self.config.num_key_value_heads,
+                head_dim_v=head_dim,
+                head_dim_k=head_dim,
+                dtype=torch.bfloat16,
+                is_paged=True,
+                page_size=64,
+                total_num_pages=64 * 3,  # number of pages in the paged KV cache
+            )
+
+            def init_cache_params_in_infer_params(inference_params):
+                inference_params.cached_sequence_lengths = torch.zeros(
+                    (batch_size,), device="cuda", dtype=torch.int32
+                )
+                inference_params.input_sequence_lengths = torch.zeros(
+                    (batch_size,), device="cuda", dtype=torch.int32
+                )
+
+            init_cache_params_in_infer_params(inference_params)
+            inference_params.qkv_format_legacy = self.config.qkv_format
+
+            self._model_context_phase.set_inference_params(inference_params)
+            self._model_generation_phase.set_inference_params(inference_params)
+
+            hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params)
+
+            # Generation phase.
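+            # The cache offsets are first advanced for the token produced in the context
+            # phase; the loop that follows then generates one token per iteration, feeding
+            # the embedding of the previous token back in through `hidden_states`.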
+ if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params(inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. + # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + # setup_cache_params_from_infer_params(inference_params, input_ids) + # @sudhakars: could create position_ids from mask here + next_tokens = self._model_generation_phase(hidden_states, mask, mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="arbitrary" + ) + return logits + +class TELlamaForCausalLMCudaGraphs(TELlamaForCausalLM): + """ + TELlamaForCausalLMCudaGraphs is the version of the class TELlamaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + config.cuda_graphs_static_batch_size, + config.cuda_graphs_static_max_context_len, + config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. + self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) + self.inference_params = InferenceParams( + max_batch_size=config.cuda_graphs_static_batch_size, + max_sequence_length=config.cuda_graphs_static_max_seq_len, + qkv_format="thd", + ) + + self._model_generation_phase.set_inference_params(self.inference_params) + self._model_context_phase.set_inference_params(self.inference_params) + + def record(self): + # We want to record model in training=False, because it will be used in generation. + self.eval() + + # Here "the trick" happens. We override methods from TELlamaForCausalLM + # with their recorded version. After invocation of each of them, + # captured graph will be replayed with minimal usage of CPU, + # what will lead to huge speedup. 
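+        # A rough sketch of what happens below (illustrative only; `module` and
+        # `static_buffer` are placeholders): the callable is captured once on static
+        # buffers and the returned graphed callable replays the recorded kernels.
+        #
+        #     graphed = te.pytorch.make_graphed_callables(
+        #         module, (static_buffer,), num_warmup_iters=3, allow_unused_input=True
+        #     )
+        #     out = graphed(static_buffer)  # replays the captured graph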
+ input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + self.inference_params.reset() + self.inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + max_input_length=input_shape[1], + ) + self._model_context_phase = self.record_graph( + PartialForwardWrapper(self._model_context_phase, attn_mask_type="padding_causal" + if self.inference_params.qkv_format == "thd" + else "arbitrary"), + self.hidden_states_buffer + ) # CUDA Graphs recording + + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + self.inference_params.reset() + self.inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + max_input_length=input_shape[1], + ) + self._model_generation_phase = self.record_graph( + PartialForwardWrapper(self._model_generation_phase, mask_type="padding" + if self.inference_params.qkv_format=="thd" + else "arbitrary"), + self.generation_buffer + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=3, + ) + return graphed_function diff --git a/docs/examples/te_gemma/te_llama_loading_weights.py b/docs/examples/te_gemma/te_llama_loading_weights.py new file mode 100755 index 0000000000..a5ab151f67 --- /dev/null +++ b/docs/examples/te_gemma/te_llama_loading_weights.py @@ -0,0 +1,224 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import re +import gc +import torch + +from typing import List + +from transformer_engine.pytorch.fp8 import fp8_model_init + +from transformers.modeling_utils import load_state_dict, _load_state_dict_into_model +from transformers.utils.hub import get_checkpoint_shard_files + +""" + This file contains logic of mapping the HuggingFace LlamaModel parameters + with TransformerEngine TransformerLayer. When we have initialized Transformer models + both with HF and with TE, we can copy parameters from the first to the second. +""" + + +def _load_weights_for_fp8_model(vanilla_model, hyperparams): + # The weights are loaded from the file with state_dict + # of model with weights which contains also fp8 parameters. 
+ # The weights are in BF16 precision, but they contain fp8 metadata + # computed by the calibration procedure. + vanilla_model.load_state_dict( + torch.load(hyperparams.fp8_model_weights_filename), + strict=False, + # strict = false, because some parameters have + # multiple pointers to the same weight + # vanilla_model._model_context_phase.model + # and vanilla_model._model_generation_phase.model + ) + + +def _load_weights_for_standard_model(vanilla_model, config): + # The weights are loaded from the file with original weights. + archive_file = os.path.join(config.model_name, "model.safetensors.index.json") + resolved_archive_file, _ = get_checkpoint_shard_files(config.model_name, archive_file) + total_dict = {} + for shard_file in resolved_archive_file: + state_dict = load_state_dict(shard_file) + total_dict.update(state_dict) + + replace_params( + total_dict, + vanilla_model.state_dict(), + config, + qkv_fused_and_interleaved=config.fuse_qkv_params, + ) + # Copy parameters like embedding: + _load_state_dict_into_model(vanilla_model, total_dict, start_prefix="") + + # Force mem release. Taken from huggingface code. + del total_dict + gc.collect() + + +def load_te_model(cls, config): + """ + Custom method adapted from `from_pretrained` method in HuggingFace + Transformers repo: + https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579 + """ + + config.use_cache = False # To make TransformerLayer compatible with LlamaModel + with fp8_model_init(config.fp8_model_init): + # there we need only to create model + vanilla_model = cls(config).to(torch.bfloat16).cuda() + + # return vanilla_model + # and now we copy the weights into it + if config.fp8_model_weights_filename is not None: + _load_weights_for_fp8_model(vanilla_model, config) + else: + _load_weights_for_standard_model(vanilla_model, config) + + return vanilla_model + + +def _get_all_layer_prefixes_to_update(hf_state_dict): + """ + There are many parameters in hf_state_dict, whose name start with "model.layers.[number]." + This function extracts all strings like "model.layers.[number]." + that are starting strings of keys in hf_state_dict. + """ + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." + m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + return all_layer_prefixes + + +def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): + # collect all layer prefixes to update + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." 
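+        # For example, the key "model.layers.0.self_attn.q_proj.weight" matches and
+        # contributes the prefix "model.layers.0.".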
+ m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + + for layer_prefix in all_layer_prefixes: + # When loading weights into models with less number of layers, skip the + # copy if the corresponding layer doesn't exist in HF model + if layer_prefix + "input_layernorm.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.layer_norm_weight"].data[ + : + ] = hf_state_dict[layer_prefix + "input_layernorm.weight"].data[:] + + if layer_prefix + "self_attn.q_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.query_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.q_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.k_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.key_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.k_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.v_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.value_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.v_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.o_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.proj.weight"].data[:] = hf_state_dict[ + layer_prefix + "self_attn.o_proj.weight" + ].data[:] + + if layer_prefix + "post_attention_layernorm.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.layer_norm_weight"].data[:] = hf_state_dict[ + layer_prefix + "post_attention_layernorm.weight" + ].data[:] + + # It may happen that gate_proj.weight and up_proj.weight will be in the different files, so we need to + # load them separately. + if layer_prefix + "mlp.gate_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc1_weight"].data[ + : config.intermediate_size + ] = hf_state_dict[layer_prefix + "mlp.gate_proj.weight"].data + + if layer_prefix + "mlp.up_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc1_weight"].data[ + config.intermediate_size : + ] = hf_state_dict[layer_prefix + "mlp.up_proj.weight"].data + + if layer_prefix + "mlp.down_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc2_weight"].data[:] = hf_state_dict[ + layer_prefix + "mlp.down_proj.weight" + ].data[:] + return all_layer_prefixes + + +# def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): +# """ +# Replaces params from TE TransformerLayer state_dict with corresponding parameters +# from HuggingFace LlamaModel state_dict. 
+# """ +# all_layer_prefixes: List[str] = _get_all_layer_prefixes_to_update(hf_state_dict) + +# head_dim = config.hidden_size // config.num_attention_heads + +# for layer_prefix in all_layer_prefixes: + +# def copy_from_ht_to_te(te_name, hf_name, start=None, end=None): +# te_state_dict[layer_prefix + te_name].data[start:end].copy_( +# hf_state_dict[layer_prefix + hf_name] +# ) + +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.layer_norm_weight", "input_layernorm.weight" +# ) +# copy_from_ht_to_te("self_attention.proj.weight", "self_attn.o_proj.weight") +# copy_from_ht_to_te("layernorm_mlp.layer_norm_weight", "post_attention_layernorm.weight") +# copy_from_ht_to_te("layernorm_mlp.fc2_weight", "mlp.down_proj.weight") +# copy_from_ht_to_te( +# "layernorm_mlp.fc1_weight", "mlp.gate_proj.weight", end=config.intermediate_size +# ) +# copy_from_ht_to_te( +# "layernorm_mlp.fc1_weight", "mlp.up_proj.weight", start=config.intermediate_size +# ) + +# if qkv_fused_and_interleaved: +# """ +# When qkv_fused_and_interleaved=True, key, query and value layers are on one tensor +# in TE TransformerLayer. Moreover they are interleaved within each head. +# Let q_i, k_i and v_i be query, key and value layers for i-th head respectively. +# Then TE stores weight tensor in the form: +# [q1 k1 v1 q2 k2 v2 ...] +# This is done to maximally optimize performance time. +# """ +# te_qkv_layer = te_state_dict[layer_prefix + "self_attention.layernorm_qkv.weight"] + +# def copy_interleave(hf_name, idx): +# src = hf_state_dict[layer_prefix + hf_name] +# for head_nr in range(config.num_attention_heads): +# dst_offset = head_nr * config.head_dim * 3 +# dst_slice = slice( +# dst_offset + idx * config.head_dim, dst_offset + (idx + 1) * config.head_dim +# ) +# src_slice = slice( +# head_nr * config.head_dim, head_nr * config.head_dim + config.head_dim +# ) +# te_qkv_layer[dst_slice, :] = src[src_slice, :] + +# copy_interleave("self_attn.q_proj.weight", 0) +# copy_interleave("self_attn.k_proj.weight", 1) +# copy_interleave("self_attn.v_proj.weight", 2) +# else: +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.query_weight", "self_attn.q_proj.weight" +# ) +# copy_from_ht_to_te("self_attention.layernorm_qkv.key_weight", "self_attn.k_proj.weight") +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.value_weight", "self_attn.v_proj.weight" +# ) + +# return all_layer_prefixes diff --git a/docs/examples/te_gemma/test_paged_attn.ipynb b/docs/examples/te_gemma/test_paged_attn.ipynb new file mode 100755 index 0000000000..543ebe9262 --- /dev/null +++ b/docs/examples/te_gemma/test_paged_attn.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ace403ac-c276-4378-a4e8-0155165f9934", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb b/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb new file mode 100755 index 0000000000..7875ffc9f3 --- /dev/null +++ b/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb @@ 
-0,0 +1,314 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accelerating a Hugging Face Gemma model finetuning with Transformer Engine" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the previous [tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb), we demonstrated how to accelerate HF Llama models using the Transformer Engine library. We replaced `LlamaDecoderLayer` with `TransformerLayer` from the Transformer Engine, achieving a speedup. Furthermore, we conducted the finetuning in FP8 precision, which yielded an additional speedup.\n", + "\n", + "Now, we will undertake a similar enhancement for the Google's [Gemma](https://blog.google/technology/developers/gemma-open-models/) model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial\n", + "\n", + "Following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_gemma.py`\n", + " - This file contains the code to load a Hugging Face Gemma checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `GemmaDecoderLayer`. This is used in the following two sections of the tutorial - \"Improvement 1\" and \"Improvement 2\".\n", + "2. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "3. `requirements.txt`\n", + " - This file contains necessary Python packages for this tutorial.\n", + "4. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "\n", + "import torch\n", + "cudnn_version = torch.backends.cudnn.version()\n", + "assert cudnn_version >= 90100, \"cuDNN version >= 9.1.0 is needed to run this tutorial.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Differences between Llama and Gemma" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thr Llama and the Gemma are very similar models - both are based on Transformer Decoder architecture. The most important architectural differences between them are the following:\n", + "\n", + "\n", + "| Feature | Llama | Gemma |\n", + "|----------------------------------------------|------------------------------------|--------------------------------------------|\n", + "| **Norm Layer** | Standard RMSNorm
$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\varepsilon}} * \\gamma + \\beta$ | RMSNorm with zero centered gamma parameter
$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\varepsilon}} * (\\textcolor{red}{1 +} \\gamma) + \\beta$ |\n", + "| **Embedding Dimension/Head Dimension** | 4096/4096 | 3072/4096 |\n", + "| **Activation Function** | SwiGlu | GeGlu |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Baseline] Running HF `GemmaModel` (Precision: `BF16`)\n", + "\n", + "Similarly to the Llama tutorial, we begin the experiments by running baseline Hugging Face Gemma model finetuning in BF16 precision.\n", + "\n", + "

\n", + "\n", + "Note\n", + " \n", + "This tutorial loads and trains a Gemma 7B model which takes up most of the GPU memory and therefore, we need to restart the jupyter notebook each time before running the following sections. A small utility method `restart_jupyter_notebook` is defined in the accompanying `utils.py` file. This function restarts the jupyter notebook so that the GPU memory is flushed before the model is loaded again from the checkpoint in order to avoid running into OOM (Out Of Memory) errors.\n", + "\n", + "If the utility doesn't work, comment this line `restart_jupyter_notebook()` in the following cell and manually restart the jupyter notebook before running the cell. Repeat the same for other sections in this tutorial.\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "298 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_baseline_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's add this information in a table and keep comparing it with a few possible improvements in future sections:\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Improvement 1] Replace HF's `GemmaDecoderLayer` with TE's `TransformerLayer` (Precision: `BF16`)\n", + "\n", + "We replace *GemmaDecoderLayer* with the highly tuned *TransformerLayer*, similarly to our approach in the [Llama tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb). Let's observe the impact this change has on the model's speed." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "257 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_gemma_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `GemmaDecoderLayer` gives a speedup of **16%** even when using only BF16 precision!\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 257 | 1.16 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Improvement 2] Replace HF's `GemmaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n", + "\n", + "The last improvement is about enabling FP8 precision. Let's see how it works." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "214 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "#restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"fp8\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_gemma_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 257 | 1.16 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 214 | 1.39 |\n", + "\n", + "\n", + "After turning on FP8 precision, we get even more speedup of almost **39%**!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "As shown in the [Llama tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb), using the `TransformerLayer` module from Transformer Engine to replace Hugging Face's `GemmaDecoderLayer` results in a speedup compared to Hugging Face's native Gemma implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## See more\n", + "\n", + "We also prepared [tutorial](./tutorial_generation_gemma_with_te.ipynb) in which we will show how to speedup the Gemma model generation using Transformer Engine." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb b/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb new file mode 100755 index 0000000000..acb93b795e --- /dev/null +++ b/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb @@ -0,0 +1,1277 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "40364db7", + "metadata": {}, + "source": [ + "# Accelerating token generation of the Hugging Face Gemma Model with Transformer Engine\n", + "\n", + "Generative AI has made remarkable strides in recent years, with Large Language Models (LLMs) like ChatGPT at the forefront. These models have revolutionized how we interact with machine-generated content, providing capabilities that range from writing assistance to complex decision support. The core functionality of these models is the generation process, which involves predicting the next token in a sequence based on the preceding text. This task is critical for applications such as automated content creation, translation, and more, emphasizing the importance of efficient implementation.\n", + "\n", + "\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Animation 1: Hugging Face Gemma model token generation.\n", + "
\n", + "
\n", + "\n", + "For those seeking a deeper understanding of text generation mechanisms in Transformers, it is recommended to check out the [HuggingFace generation tutorial](https://huggingface.co/docs/transformers/llm_tutorial).\n", + "\n", + "In the previous tutorials on [Llama](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb) and [Gemma](./tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb), it was demonstrated how finetuning can be accelerated using the Transformer Engine's `TransformerLayer`. Building on this foundation, the current objective is to enhance the generation speed of the Gemma model.\n", + "\n", + "This tutorial will introduce and explain several advanced features of the Transformer Engine that contribute to this goal:\n", + "\n", + "###### **1. THD Attention Layout.**\n", + "\n", + "Addressing the challenge of computing attention for sequences with varying lengths, a common method is to pad these sequences and apply an attention mask. The Transformer Engine, however, offers a more optimized approach—by specifying the lengths and offsets of the sequences, attention can be computed directly. Instead of passing the tensor with shape `[b, s, h, d]` and the attention mask, one can pass a tensor of the shape `[t, h, d]` along with tensors detailing cumulative sequence lengths and offsets to run the attention optimized for this case. This specific attention layout is referred to as the **THD layout**. \n", + "\n", + "\n", + "The letter `t` in the standard `[t, h, d]` layout is equal to the total length of the sequences, namely `t = s_1 + s_2 + ... + s_b`, where `s_i` denotes the length of sequence `i`. TransformerEngine supports a THD layout that incorporates gaps between these sequences - the lengths of the offsets need to be passed in the additional parameter.\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 1: The difference between BSHD (default) and THD attention layouts is as follows: with BSHD, one needs to provide the attention mask, while with THD, one needs to provide cumulative sequence lengths and sequence offsets.\n", + "
\n", + "
\n", + "\n", + "###### **2. CUDA Graphs API.**\n", + "\n", + "The speed of GPUs is increasing at a rapid pace. It turns out that sometimes the runtime of kernels is shorter than the time it takes for the CPU to submit them, which can lead to significant overhead. CUDA Graphs can address this issue. When certain kernels are executed repeatedly, it allows us to record and replay them with less CPU involvement. This becomes particularly useful in applications like token generation, where a `TransformerLayer` is run for every token that needs to be generated.\n", + "\n", + "One can read more about CUDA Graphs [here](https://developer.nvidia.com/blog/cuda-graphs/).\n", + "\n", + "PyTorch exposes graphs via a raw `torch.cuda.CUDAGraph` class and two convenience wrappers: `torch.cuda.graph` and `torch.cuda.make_graphed_callables`. More information about the cuda graphs in Pytorch can be found [here](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/).\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 2: CUDA Graphs reduce the overhead generated by the long time it takes to launch a single kernel. It enables the recording and replaying of subsequent launches, thus reducing the total time used by the CPU.\n", + "
\n", + "
\n", + "\n", + "\n", + "###### **3. FP8 Weights Calibration.**\n", + "\n", + "Assuming that the model is trained in FP32/BF16 precision and the goal is to execute it in FP8 precision, the process isn't straightforward due to the absence of appropriate FP8 scaling factors. In this scenario, FP8 calibration becomes essential. By conducting several forward passes on sample data, the FP8 scaling parameters can be computed. This calibration allows the model to operate correctly in FP8 precision.\n", + "\n", + "It is highly recommended to familiarize oneself with the [tutorial](../../examples/fp8_primer.ipynb) on FP8 precision to understand the importance of proper scaling factors.\n", + "\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 3:\n", + "If the model is trained in BF16/FP32, it does not include the computed FP8 scaling factors. When it is run under fp8_autocast(), the value of these scaling factors will default to their initial values, which can cause numerical errors. Weight calibration involves calculating FP8 scaling factors from higher precision forward passes. Once these factors are computed, the model becomes numerically stable. \n", + "
\n", + "
\n", + "\n", + "###### **4. FP8 Model Weights.**\n", + "\n", + "The typical approach is to store weights in higher precision and then cast them to fp8 before operations. This may prevent accuraccy drops in training. However, for inference, this level of precision is not necessary.\n", + "\n", + "The TransformerEngine includes a wrapper `fp8_model_​init`, which allows for the creation of models that store only the FP8 copy of the weights. This eliminates the need to cast from higher precision to BF16, saving time in this casting process. \n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 4: Model under fp8_autocast() stores weights in high precision by default, and casts them if needed. It can leads to slowdown and increased memory usage. Using fp8_model_init() results in storing weight in FP8.\n", + "
\n", + "
\n", + "\n", + "###### Benchmarking\n", + "\n", + "We'll evaluate the generation time across one benchmark: generation with context phase max sequence length = 128, batch size = 64 and number of generated tokens = 896 on random texts with random lengths.\n", + "\n", + "
\n", + "Note\n", + " \n", + "This tutorial focuses on showcasing the mentioned features of Transformer Engine in the context of token generation. It's important to note, however, that NVIDIA provides [TensorRT](https://developer.nvidia.com/tensorrt), which is optimized for inference tasks and should be considered for such use cases.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "b18f91a9", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "e5201d77", + "metadata": {}, + "source": [ + "Following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_gemma.py`\n", + " - This file contains the code to load a Hugging Face Gemma checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `GemmaDecoderLayer`. It does also contain code for generation with THD attention, CUDA Graphs and weight calibration.\n", + "2. `te_gemma_loading_weights.py`\n", + " - This file contains logic of mapping the parameters from `GemmaDecoderLayer` into the `TransformerLayer`.\n", + "3. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "4. `requirements.txt`\n", + " - This file contains necessary Python packages for this tutorial.\n", + "5. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "31390c76", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", + "Collecting transformers==4.41.1 (from -r requirements.txt (line 1))\n", + " Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)\n", + "Collecting accelerate==0.30.1 (from -r requirements.txt (line 2))\n", + " Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)\n", + "Collecting datasets==2.19.1 (from -r requirements.txt (line 3))\n", + " Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)\n", + "Collecting sentencepiece==0.2.0 (from -r requirements.txt (line 4))\n", + " Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (3.16.1)\n", + "Collecting huggingface-hub<1.0,>=0.23.0 (from transformers==4.41.1->-r requirements.txt (line 1))\n", + " Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (1.24.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (2024.9.11)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (2.32.3)\n", + "Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.1->-r requirements.txt (line 1))\n", + " Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from 
transformers==4.41.1->-r requirements.txt (line 1)) (0.4.5)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (4.66.5)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate==0.30.1->-r requirements.txt (line 2)) (6.0.0)\n", + "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==0.30.1->-r requirements.txt (line 2)) (2.5.0a0+e000cf0ad9.nv24.10)\n", + "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (16.1.0)\n", + "Collecting pyarrow-hotfix (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n", + "Collecting dill<0.3.9,>=0.3.0 (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (2.2.2)\n", + "Collecting xxhash (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", + "Collecting multiprocess (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)\n", + "Collecting fsspec<=2024.3.1,>=2023.1.0 (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (3.10.5)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (2.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (4.0.3)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.0->transformers==4.41.1->-r requirements.txt (line 1)) (4.12.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in 
/usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (2024.8.30)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (3.3)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (3.1.4)\n", + "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (1.13.1)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (1.3.0)\n", + "INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.\n", + "Collecting multiprocess (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2024.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets==2.19.1->-r requirements.txt (line 3)) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (2.1.5)\n", + "Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m175.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading accelerate-0.30.1-py3-none-any.whl (302 kB)\n", + "Downloading datasets-2.19.1-py3-none-any.whl (542 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m542.0/542.0 kB\u001b[0m \u001b[31m334.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m628.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n", + "Downloading fsspec-2024.3.1-py3-none-any.whl (171 kB)\n", + "Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)\n", + "Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m296.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", + "Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", + "Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", + "Installing collected packages: sentencepiece, xxhash, pyarrow-hotfix, fsspec, dill, multiprocess, huggingface-hub, tokenizers, accelerate, transformers, datasets\n", + " Attempting uninstall: fsspec\n", + " Found existing installation: fsspec 2024.6.1\n", + " Uninstalling fsspec-2024.6.1:\n", + " Successfully uninstalled fsspec-2024.6.1\n", + " Attempting uninstall: dill\n", + " Found existing installation: dill 0.3.9\n", + " Uninstalling dill-0.3.9:\n", + " Successfully uninstalled dill-0.3.9\n", + "Successfully installed accelerate-0.30.1 datasets-2.19.1 dill-0.3.8 fsspec-2024.3.1 huggingface-hub-0.26.2 multiprocess-0.70.16 pyarrow-hotfix-0.6 sentencepiece-0.2.0 tokenizers-0.19.1 transformers-4.41.1 xxhash-3.5.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -r requirements.txt\n", + "\n", + "import torch\n", + "cudnn_version = torch.backends.cudnn.version()\n", + "assert cudnn_version >= 90100, \"cuDNN version >= 9.1.0 is needed to run this tutorial.\"" + ] + }, + { + "cell_type": "markdown", + "id": "e8dfabbf", + "metadata": {}, + "source": [ + "\n", + "|\n", + "## [Baseline] Running Hugging Face generation with Gemma model" + ] + }, + { + "cell_type": "markdown", + "id": "59560bff", + "metadata": {}, + "source": [ + "HuggingFace Transformers library offers generation API. \n", + "HuggingFace generation for the Gemma model will be used as a baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2803e0ec", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n", + "`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.\n", + "Gemma's activation function will be set to `gelu_pytorch_tanh`. 
Please, use\n", + "`config.hidden_activation` if you want to override this behaviour.\n", + "See https://github.com/huggingface/transformers/pull/29402 for more details.\n", + "Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.02s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops high-performance computer graphics and computer processing units (CPUs) for the gaming and professional markets.\n", + "* The company was founded in 1993 and is headquartered in Santa Clara\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops high-performance computer graphics and computer processing units (CPUs) for the gaming and professional markets.\n", + "* The company was founded in 1993 and is headquartered in Santa Clara\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "# !!! `model_name` attr must point to the location of the model weights !!!\n", + "# Weights can be downloaded from: https://huggingface.co/google/gemma-7b.\n", + "# Weights should be in the *.safetensors HF format, not in the original format.\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "\n", + "model = init_baseline_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "b3698dc6", + "metadata": {}, + "source": [ + "Let's put this time into the table for later comparison.\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |" + ] + }, + { + "cell_type": "markdown", + "id": "8bb40f45", + "metadata": {}, + "source": [ + "## [Improvement 1] Using TransformerLayer from Transformer Engine instead of GemmaDecoderLayer." + ] + }, + { + "cell_type": "markdown", + "id": "263b40f2", + "metadata": {}, + "source": [ + "As in the [Gemma](./tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb) finetuning tutorial, a GemmaDecoderLayer is substituted by a tuned TransformerLayer from the Transformer Engine. Let's run it and compare the time with the baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9dceef93", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. 
Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py\u001b[0m(8223)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 8221 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 8222 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 8223 \u001b[0;31m key_layer, value_layer = inference_params.save_to_kv_cache(\n", + "\u001b[0m\u001b[0;32m 8224 \u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer_number\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_layer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue_layer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 8225 \u001b[0;31m )\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> key_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([128, 64, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> value_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([128, 64, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> query_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([8192, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> c\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "Queries, keys and values must be 4D tensors when qkv_format = bshd!", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 11\u001b[0m\n\u001b[1;32m 7\u001b[0m hyperparams\u001b[38;5;241m.\u001b[39mmodel_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/tmp/gemma-7b-hf\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\u001b[39;00m\n\u001b[1;32m 9\u001b[0m model \u001b[38;5;241m=\u001b[39m init_te_gemma_model(hyperparams)\n\u001b[0;32m---> 11\u001b[0m \u001b[43mprint_sample_of_generated_texts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# benchmark_generation(model)\u001b[39;00m\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/utils.py:280\u001b[0m, in \u001b[0;36mprint_sample_of_generated_texts\u001b[0;34m(model)\u001b[0m\n\u001b[1;32m 277\u001b[0m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcuda()\n\u001b[1;32m 278\u001b[0m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcuda()\n\u001b[0;32m--> 280\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_new_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m50\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 281\u001b[0m generated_texts \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mbatch_decode(outputs, skip_special_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprint_output\u001b[39m(prompts, generated_texts, idx):\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:450\u001b[0m, in \u001b[0;36mTEGemmaForCausalLM.generate\u001b[0;34m(self, input_ids, pad_token_id, max_new_tokens, *args, **kwargs)\u001b[0m\n\u001b[1;32m 446\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 447\u001b[0m \u001b[38;5;66;03m# For thd layout padding is at the end, otherwise at the beginning.\u001b[39;00m\n\u001b[1;32m 448\u001b[0m 
TEGemmaForCausalLM\u001b[38;5;241m.\u001b[39m_padding_to_end(input_ids, lengths)\n\u001b[0;32m--> 450\u001b[0m hidden_states, next_tokens \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_generate_context_phase\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;66;03m# Generation phase.\u001b[39;00m\n\u001b[1;32m 453\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:381\u001b[0m, in \u001b[0;36mTEGemmaForCausalLM._generate_context_phase\u001b[0;34m(self, input_ids, inference_params)\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 379\u001b[0m inference_params\u001b[38;5;241m.\u001b[39msetup_before_new_input(length\u001b[38;5;241m=\u001b[39minput_ids\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m--> 381\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_model_context_phase\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 382\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 383\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m!=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpadding_causal\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43marbitrary\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 385\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[38;5;66;03m# We choose logits coresponding with last token in each sequence,\u001b[39;00m\n\u001b[1;32m 388\u001b[0m \u001b[38;5;66;03m# which have various lengths - they are stored in 
(inference_params.incoming_seq_len - 1)\u001b[39;00m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;66;03m# Tensor when qkv_format == \"thd\" and\u001b[39;00m\n\u001b[1;32m 390\u001b[0m \u001b[38;5;66;03m# they are the last token in the sequence when qkv_format != \"thd\".\u001b[39;00m\n\u001b[1;32m 391\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:183\u001b[0m, in \u001b[0;36mStaticGemmaModel.forward\u001b[0;34m(self, hidden_states, attention_mask, attn_mask_type)\u001b[0m\n\u001b[1;32m 180\u001b[0m 
hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnormalizer\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, decoder_layer \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mlayers):\n\u001b[0;32m--> 183\u001b[0m hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 186\u001b[0m \u001b[43m \u001b[49m\u001b[43mself_attn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmask\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\n\u001b[1;32m 189\u001b[0m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 190\u001b[0m ] \u001b[38;5;66;03m# static copy - for CUDA graphs\u001b[39;00m\n\u001b[1;32m 192\u001b[0m hidden_states\u001b[38;5;241m.\u001b[39mcopy_(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mnorm(hidden_states)) \u001b[38;5;66;03m# static copy - for CUDA graphs\u001b[39;00m\n\u001b[1;32m 193\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(hidden_states)\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to 
skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:151\u001b[0m, in \u001b[0;36mTEGemmaDecoderLayer.forward\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 148\u001b[0m kwargs\u001b[38;5;241m.\u001b[39mpop(key, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# We need to return tuple to be compatible with HF.\u001b[39;00m\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mte_rope_emb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m,)\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/transformer.py:690\u001b[0m, in \u001b[0;36mTransformerLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, self_attn_mask_type, window_size, encoder_output, enc_dec_attn_mask, enc_dec_attn_mask_type, enc_dec_window_size, is_first_microbatch, checkpoint_core_attention, inference_params, rotary_pos_emb, core_attention_bias_type, core_attention_bias, alibi_slopes, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, fast_zero_fill)\u001b[0m\n\u001b[1;32m 687\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m cast_if_needed(hidden_states, torch\u001b[38;5;241m.\u001b[39mget_autocast_gpu_dtype())\n\u001b[1;32m 689\u001b[0m \u001b[38;5;66;03m# Self attention.\u001b[39;00m\n\u001b[0;32m--> 690\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself_attention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 691\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 692\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 693\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mself_attn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 694\u001b[0m \u001b[43m \u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwindow_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 695\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 696\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_first_microbatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_first_microbatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 697\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 698\u001b[0m \u001b[43m \u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 699\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 700\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 701\u001b[0m \u001b[43m \u001b[49m\u001b[43malibi_slopes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malibi_slopes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 702\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 703\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 704\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 706\u001b[0m \u001b[43m \u001b[49m\u001b[43mfast_zero_fill\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfast_zero_fill\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 707\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 709\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_residual_connection_post_layernorm \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_layernorm:\n\u001b[1;32m 710\u001b[0m attention_output, attention_bias, residual \u001b[38;5;241m=\u001b[39m self_attention_outputs\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py:9453\u001b[0m, in \u001b[0;36mMultiheadAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, encoder_output, attn_mask_type, window_size, is_first_microbatch, checkpoint_core_attention, inference_params, rotary_pos_emb, core_attention_bias_type, core_attention_bias, alibi_slopes, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, fast_zero_fill)\u001b[0m\n\u001b[1;32m 9447\u001b[0m query_layer \u001b[38;5;241m=\u001b[39m query_layer\u001b[38;5;241m.\u001b[39mview(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m*\u001b[39mquery_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m2\u001b[39m:])\u001b[38;5;241m.\u001b[39mcontiguous()\n\u001b[1;32m 9449\u001b[0m \u001b[38;5;66;03m# ===========================\u001b[39;00m\n\u001b[1;32m 9450\u001b[0m \u001b[38;5;66;03m# Core attention computation\u001b[39;00m\n\u001b[1;32m 9451\u001b[0m \u001b[38;5;66;03m# ===========================\u001b[39;00m\n\u001b[0;32m-> 9453\u001b[0m context_layer \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcore_attention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9454\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9455\u001b[0m \u001b[43m \u001b[49m\u001b[43mkey_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9456\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalue_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9457\u001b[0m \u001b[43m \u001b[49m\u001b[43mqkv_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9458\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9459\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9460\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9461\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9462\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9463\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9464\u001b[0m \u001b[43m \u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwindow_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9465\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9466\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9467\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9468\u001b[0m \u001b[43m \u001b[49m\u001b[43malibi_slopes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malibi_slopes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9469\u001b[0m \u001b[43m \u001b[49m\u001b[43mfast_zero_fill\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfast_zero_fill\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9470\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9471\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9473\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 9474\u001b[0m \u001b[38;5;66;03m# [b * sq, h] -> [qs, b, h]\u001b[39;00m\n\u001b[1;32m 9475\u001b[0m context_layer \u001b[38;5;241m=\u001b[39m context_layer\u001b[38;5;241m.\u001b[39mview(\n\u001b[1;32m 9476\u001b[0m (inference_params\u001b[38;5;241m.\u001b[39mmax_batch_size, 
\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, context_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 9477\u001b[0m )\u001b[38;5;241m.\u001b[39mcontiguous()\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py:8301\u001b[0m, in \u001b[0;36mDotProductAttention.forward\u001b[0;34m(self, query_layer, key_layer, value_layer, attention_mask, qkv_format, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded, max_seqlen_q, max_seqlen_kv, attn_mask_type, window_size, checkpoint_core_attention, core_attention_bias_type, core_attention_bias, alibi_slopes, fast_zero_fill, inference_params, is_first_microbatch)\u001b[0m\n\u001b[1;32m 8298\u001b[0m context_parallel \u001b[38;5;241m=\u001b[39m cp_size \u001b[38;5;241m>\u001b[39m 
\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 8300\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m qkv_format \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msbhd\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbshd\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m-> 8301\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\n\u001b[1;32m 8302\u001b[0m \u001b[38;5;28mlen\u001b[39m(x\u001b[38;5;241m.\u001b[39mshape) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m4\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m (query_layer, key_layer, value_layer)\n\u001b[1;32m 8303\u001b[0m ), \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mQueries, keys and values must be 4D tensors when qkv_format = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mqkv_format\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m!\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 8304\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m qkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msbhd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 8305\u001b[0m max_seqlen_q \u001b[38;5;241m=\u001b[39m query_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m max_seqlen_q \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m max_seqlen_q\n", + "\u001b[0;31mAssertionError\u001b[0m: Queries, keys and values must be 4D tensors when qkv_format = bshd!" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "b5d40836", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "source": [ + "The speedup of **62%** was obtained." + ] + }, + { + "cell_type": "markdown", + "id": "006d18e8", + "metadata": {}, + "source": [ + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | " + ] + }, + { + "cell_type": "markdown", + "id": "2bbf3d47", + "metadata": {}, + "source": [ + "## [Improvement 2] Use of THD attention layout.\n", + "\n", + "Input sequences can have various lengths. Hugging Face generation – as can be seen in Animation 1 – pads the sequences and then uses attention mask. In the THD attention layout cumulative sequence lengths and offsets need to be provided, instead of attention mask. The THD attention layout is much more optimized than BSHD layout.\n", + "\n", + "The class `transformer_engine.pytorch.DotProductAttention` supports this format. 
One needs to pass the following arguments to its forward call (a short sketch follows this list):\n", + "- `seq_offsets_q`, `seq_offsets_k`, `seq_offsets_v` – offsets of the beginning of each sequence in the packed tensors,\n", + "- `cu_seqlens_q`, `cu_seqlens_kv` – cumulative sums of the sequence lengths for the query and key/value tensors,\n", + "- `max_seqlen_q` – maximum sequence length in the query layer,\n", + "- `max_seqlen_kv` – maximum sequence length in the key/value layer.\n", + "\n",
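+ "Below is a minimal sketch of such a call, assuming illustrative shapes, sequence lengths and variable names (it is not part of this tutorial's code):\n",
+ "\n",
+ "```\n",
+ "import torch\n",
+ "import transformer_engine.pytorch as te\n",
+ "\n",
+ "num_heads, head_dim = 16, 256\n",
+ "seq_lens = torch.tensor([5, 3], dtype=torch.int32, device=\"cuda\")  # two packed sequences\n",
+ "cu_seqlens = torch.zeros(len(seq_lens) + 1, dtype=torch.int32, device=\"cuda\")\n",
+ "cu_seqlens[1:] = torch.cumsum(seq_lens, dim=0)  # cumulative lengths: [0, 5, 8]\n",
+ "total_tokens = int(cu_seqlens[-1])\n",
+ "\n",
+ "# THD layout: the tokens of all sequences are packed along the first dimension.\n",
+ "q = torch.randn(total_tokens, num_heads, head_dim, dtype=torch.bfloat16, device=\"cuda\")\n",
+ "k, v = torch.randn_like(q), torch.randn_like(q)\n",
+ "\n",
+ "attn = te.DotProductAttention(num_heads, head_dim, attn_mask_type=\"padding_causal\")\n",
+ "out = attn(\n",
+ "    q, k, v,\n",
+ "    qkv_format=\"thd\",\n",
+ "    cu_seqlens_q=cu_seqlens, cu_seqlens_kv=cu_seqlens,\n",
+ "    max_seqlen_q=int(seq_lens.max()), max_seqlen_kv=int(seq_lens.max()),\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "Because the sequences are packed back to back, no compute is spent on padding tokens.\n",
+ "\n",
+ "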
\n", + "Note\n", + "\n", + "Currently, the THD attention for `TransformerLayer` is supported only for token generation.\n", + "
\n", + "\n", + "Let's look how using TransformerEngine with THD attention impacts the speed of token generation:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4fc5e1cd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. 
They are very bad at doing different things at the same time.\n",
+ "\n",
+ "This is why they are so good at rendering 3D graphics.\n",
+ "\n",
+ "The GPU\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Restart the notebook (to flush the GPU memory)\n",
+ "from utils import restart_jupyter_notebook\n",
+ "restart_jupyter_notebook()\n",
+ "\n",
+ "from utils import *\n",
+ "\n",
+ "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n",
+ "hyperparams.qkv_format = \"thd\"\n",
+ "\n",
+ "model = init_te_gemma_model(hyperparams)\n",
+ "\n",
+ "print_sample_of_generated_texts(model)\n",
+ "# benchmark_generation(model)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e397a65",
+ "metadata": {},
+ "source": [
+ "By using THD attention, the following speedup was obtained:\n",
+ "\n",
+ "| Models | Time (s) | Speedup | \n",
+ "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n",
+ "| HF (baseline) | 87.68 | 1 |\n",
+ "| TE (substitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n",
+ "| TE + THD attention | 28.22 | 3.11 | "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "21a89d9c",
+ "metadata": {},
+ "source": [
+ "## [Improvement 3] Speeding up generation with CUDA Graphs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e2d53e7b",
+ "metadata": {},
+ "source": [
+ "Transformer Engine includes the function `transformer_engine.pytorch.make_graphed_callables`, which works similarly to its PyTorch counterpart and can record CUDA Graphs for Transformer Engine modules. Below is a code excerpt from the class `TEGemmaForCausalLMCudaGraphs` in `te_gemma.py`:\n",
+ "```\n",
+ "    def __init__(self, config: GemmaConfig):\n",
+ "        (...)\n",
+ "\n",
+ "        # Here \"the trick\" happens. We override methods from TEGemmaForCausalLM\n",
+ "        # with their recorded versions. After each invocation, the captured graph\n",
+ "        # is replayed with minimal CPU usage, which leads to a huge speedup.\n",
+ "        (...)\n",
+ "        self._model_context_phase = self.record_graph(\n",
+ "            self._model_context_phase, self.hidden_states_buffer)  # CUDA Graphs recording\n",
+ "\n",
+ "        (...)\n",
+ "        self._model_generation_phase = self.record_graph(\n",
+ "            self._model_generation_phase, self.generation_buffer)  # CUDA Graphs recording\n",
+ "\n",
+ "    @torch.no_grad()\n",
+ "    def record_graph(self, function, input_tensor):\n",
+ "        (...)\n",
+ "        # The function is invoked on (input_tensor,) and all launched kernels are recorded.\n",
+ "        # record_graph() returns the captured function, which can later be run with minimal use of the CPU.\n",
+ "        fp8_format = Format.HYBRID\n",
+ "        fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=32, amax_compute_algo=\"max\")\n",
+ "        with autocast(dtype=torch.bfloat16, cache_enabled=False):\n",
+ "            graphed_function = te.pytorch.make_graphed_callables(\n",
+ "                function,\n",
+ "                (input_tensor,),\n",
+ "                fp8_enabled=True,\n",
+ "                fp8_recipe=fp8_recipe,\n",
+ "                allow_unused_input=True,\n",
+ "                num_warmup_iters=3,\n",
+ "            )\n",
+ "        return graphed_function\n",
+ "```\n",
+ "\n",
+ "It is strongly recommended to review the entire code of the class `TEGemmaForCausalLMCudaGraphs`. Let's now proceed to evaluate the performance improvement offered by CUDA Graphs.\n",
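+ "\n",
+ "For orientation, here is a minimal, self-contained sketch of the same `make_graphed_callables` pattern applied to a toy `te.Linear` layer; the layer, shapes and buffer names are illustrative assumptions and are not taken from `te_gemma.py`:\n",
+ "\n",
+ "```\n",
+ "import torch\n",
+ "import transformer_engine.pytorch as te\n",
+ "\n",
+ "# A toy module standing in for the context/generation phase callables.\n",
+ "layer = te.Linear(1024, 1024, params_dtype=torch.bfloat16, device=\"cuda\")\n",
+ "\n",
+ "# A static sample input - the captured graph is replayed on fixed buffers.\n",
+ "static_input = torch.randn(8, 1024, dtype=torch.bfloat16, device=\"cuda\")\n",
+ "\n",
+ "graphed_layer = te.make_graphed_callables(\n",
+ "    layer,\n",
+ "    (static_input,),\n",
+ "    num_warmup_iters=3,\n",
+ "    allow_unused_input=True,\n",
+ ")\n",
+ "\n",
+ "# Each call now replays the captured kernels with minimal CPU overhead.\n",
+ "out = graphed_layer(static_input)\n",
+ "```"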
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "31a3a8a3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. 
They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "\n", + "# It is necessary to preallocate a static buffer.\n", + "# CUDA graphs require static input tensors for every kernel.\n", + "# This approach may result in a slight increase in memory consumption;\n", + "# however, the substantial speedup achieved makes it worthwhile.\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "53bb430f", + "metadata": {}, + "source": [ + "The **5.23x** speedup was obtained.\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | \n", + "| TE + THD attention + CUDA Graphs | 16.75 | 5.23 | \n" + ] + }, + { + "cell_type": "markdown", + "id": "0a11b75c", + "metadata": {}, + "source": [ + "Let's look at the screenshots from *NVIDIA Nsight System* profiler to see where this speedup comes from:\n", + "\n", + "
\n", + "\n", + "
\n", + "Figure 5: Without CUDA Graphs. One can see that GPU (blue) is idle for big portion of the time.\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "Figure 6: With CUDA Graphs. One can see that GPU (orange) is fully utilized.\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "e6b171a0", + "metadata": {}, + "source": [ + "## [Improvement 4] Running generation in FP8 of the model trained in higher precision " + ] + }, + { + "cell_type": "markdown", + "id": "1a80288b", + "metadata": {}, + "source": [ + "Implementing FP8 generation with the Gemma model is not straightforward, because this model was initially trained using BF16 precision, and the necessary FP8 scaling factors are missing. Running the model at this lower precision without proper scaling could lead to significant errors and incorrect results.\n", + "\n", + "It is highly recommended to familiarize oneself with the [tutorial](../../examples/fp8_primer.ipynb) on FP8 precision to understand the necessity of scaling.\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 8: The FP8 scaling factors are incorrect and that leads to numerical errors. The weight calibration allows us to compute FP8 metadata during the forwards in higher precision.\n", + "
\n", + "
\n", + "\n", + "### Weight Calibration\n", + "\n", + "To address the issue outlined above, weight calibration will be used. This involves running several forward iterations at BF16 precision within the context `te.fp8_autocast(enabled=False, calibration=True)`. This setup allows the forward pass to operate at higher precision, while simultaneously collecting `amax_history` and other parameters related to the FP8 precision, which are essential for calculating the FP8 scaling well.\n", + "\n", + "The code below outlines the steps to initialize the BF16 model and conduct several forward iterations within the specified context. After these iterations, the model is saved, and these weights will be utilized in subsequent chapters." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "aecee0e1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Repo card metadata block was not found. Setting CardData to empty.\n", + "[WARNING | huggingface_hub.repocard]: Repo card metadata block was not found. Setting CardData to empty.\n", + "Repo card metadata block was not found. Setting CardData to empty.\n", + "[WARNING | huggingface_hub.repocard]: Repo card metadata block was not found. Setting CardData to empty.\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "import transformer_engine.pytorch as te\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.fuse_qkv_params = True # This is needed by the last improvement.\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "# Calibration\n", + "with te.fp8_autocast(enabled=False, calibrating=True), \\\n", + " torch.autocast(device_type='cuda', dtype=torch.bfloat16):\n", + " model.train()\n", + " run_forward_pass(model, hyperparams, num_iters=512)\n", + "\n", + "# Compute scale_fwd with enabled fp8 autocast\n", + "with te.fp8_autocast(enabled=True), \\\n", + " torch.autocast(device_type='cuda', dtype=torch.bfloat16):\n", + " run_forward_pass(model, hyperparams, 1)\n", + "\n", + "# Some parameters are in pointing to the same tensors, double save is avoided here.\n", + "dict_to_save = {k: v for k, v in model.state_dict().items() \\\n", + " if (\"_context_phase\" not in k and \"_generation_phase\" not in k)}\n", + "torch.save(dict_to_save, 'calibrated_weights.pth') # <== Add path to save calibrated weights." + ] + }, + { + "cell_type": "markdown", + "id": "b6dcd135", + "metadata": {}, + "source": [ + "|\n", + "### Generation in FP8\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 8: After the weight calibration FP8 scaling factors are correct and prevent numerical errors.\n", + "
\n", + "
\n", + "\n", + "Now FP8 inference is ready to be run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38e005f5", + "metadata": {}, + "outputs": [], + "source": [ + "!ls -alh /perfhome/repos/data/gemma-7b-hf/" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a913f54d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. 
GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "hyperparams.fuse_qkv_params = True # This is needed by the last improvement.\n", + "\n", + "hyperparams.fp8 = True\n", + "# Calibrated fp8 weights are loaded directly from the file.\n", + "\n", + "hyperparams.fp8_model_weights_filename = \"calibrated_weights.pth\" # <== Add calibrated weights location here.\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "8cdbb56c", + "metadata": {}, + "source": [ + "One can observe that the outputs are coherent; however, the generation time has increased. Why is this the case?\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 9: Running the model at higher precision involves only one GEMM operation. However, when the model operates in FP8, it requires not just the low-precision GEMM but also weight casting.\n", + "
\n", + "
\n", + "\n", + "Running the model in FP8 does not imply that all weights are stored in FP8. By default, they are stored in higher precision and are cast to FP8, using saved scaling factors, before operations such as GEMMs.\n", + "\n", + "This approach is beneficial during training: one can perform one cast for both backward and forward passes, leading to speedups. However, performing a single cast for each forward pass introduces too much overhead to achieve a speedup. This issue will be addressed in the next section of the tutorial." + ] + }, + { + "cell_type": "markdown", + "id": "8d3945e3", + "metadata": {}, + "source": [ + "### Use of only FP8 model weights" + ] + }, + { + "cell_type": "markdown", + "id": "2dd0cba9", + "metadata": {}, + "source": [ + "TransformerEngine stores parameters in higher precision and only casts them to FP8. It may be necessary to maintain accucacy during training. However, high precision is not needed when doing inference. \n", + "\n", + "Transformer Engine supports maintaining only FP8 weights with `fp8_model_init` decorator. Let's see an example\n", + "```\n", + "linear = te.Linear(1024, 1024) # this module is initialized with full precision weights\n", + "with te.fp8_model_init(enabled=True):\n", + " linear_fp8 = te.Linear(1024, 1024) # this module is initialized only with fp8 weights\n", + "\n", + "assert type(linear.weight.data) is torch.Tensor\n", + "assert type(linear_fp8.weight.data) is te.float8_tensor.Float8Tensor\n", + "```\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 10: Using fp8_model_init stores the weights directly in FP8 format, which reduces both time and memory usage.\n", + "
\n", + "
\n", + "\n", + "Let's run the code with `fp8_model_init`:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96264b9c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. 
GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.fuse_qkv_params = True # Needed for fp8_model_init().\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "hyperparams.fp8 = True\n", + "hyperparams.fp8_model_init = True # This will result in storing only fp8 weights.\n", + "hyperparams.fp8_model_weights_filename = \"calibrated_weights.pth\" # <== Add calibrated weights location here.\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "3e30ca5a", + "metadata": {}, + "source": [ + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | \n", + "| TE + THD attention + CUDA Graphs | 16.75 | 5.23 | \n", + "| TE + THD attention + FP8 | 12.13 | 7.23 | \n", + "\n", + "The final speedup is **7.23x**." + ] + }, + { + "cell_type": "markdown", + "id": "c6e87275", + "metadata": {}, + "source": [ + "## Conclusions" + ] + }, + { + "cell_type": "markdown", + "id": "7bb2452d", + "metadata": {}, + "source": [ + "\n", + "
\n", + "\n", + "
\n", + " Figure 11: Times obtained with optimizations using TransformerEngine (seconds).\n", + "
\n", + "
\n", + "\n", + "In this tutorial, we've explored three features of the Transformer Engine:\n", + "1. Support for the THD attention layout,\n", + "2. Integration with CUDA Graphs,\n", + "3. FP8 weights calibration,\n", + "4. Models containing only FP8 version of their parameters.\n", + "\n", + "Each of these features can be applied in various contexts, such as fast token generation. It's important to note that the fastest possible inference speeds can be achieved using NVIDIA's inference-optimized [TensorRT](https://developer.nvidia.com/tensorrt) library." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/utils.py b/docs/examples/te_gemma/utils.py new file mode 100755 index 0000000000..46577071c8 --- /dev/null +++ b/docs/examples/te_gemma/utils.py @@ -0,0 +1,366 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import time +import sys +import IPython +import random +import string + +from te_gemma_loading_weights import load_te_model +from te_llama_loading_weights import load_te_model as load_te_model_llama +import torch +from torch.optim import AdamW +from torch.utils.data import DataLoader + +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + get_linear_schedule_with_warmup, + AutoConfig, +) +from transformers import DataCollatorForLanguageModeling +from datasets import load_dataset +from accelerate import Accelerator +from accelerate.utils.dataclasses import FP8RecipeKwargs + + +from te_gemma import TEGemmaForCausalLM, TEGemmaForCausalLMCudaGraphs +from te_llama import TELlamaForCausalLM, TELlamaForCausalLMCudaGraphs + +class HyperParameters: + def __init__(self): + self.mixed_precision = "bf16" + self.model_name = None + + self.fp8 = False + + # Weights in fp8 + self.fp8_model_weights_filename = None + self.fp8_model_init = False + + # Cuda graphs + self.generation_cuda_graphs = False + self.cuda_graphs_static_batch_size = 16 + self.cuda_graphs_static_max_seq_len = 256 + self.cuda_graphs_static_max_context_len = 16 + + # Finetuning settings. + self.dataset_name = "timdettmers/openassistant-guanaco" + self.dataset_text_field = "text" + self.learning_rate = 1.41e-5 + self.batch_size = 8 + self.max_seq_length = 256 + self.gradient_accumulation_steps = 1 + self.num_warmup_steps = 5 + self.num_training_steps = 10 + + # QKV format. + self.fuse_qkv_params = False + self.qkv_format = "bshd" + + +hyperparams = HyperParameters() + +assert ( + torch.backends.cudnn.version() >= 90100 +), "cuDNN version >= 9.1.0 is needed to run this tutorial." 
+ + +def get_dataloaders(accelerator: Accelerator, hyperparams): + dataset = load_dataset(hyperparams.dataset_name, split="train") + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize(element): + outputs = tokenizer( + element["text"], + truncation=True, + padding=False, + max_length=hyperparams.max_seq_length, + return_overflowing_tokens=False, + return_length=False, + ) + return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]} + + with accelerator.main_process_first(): + dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names) + + # Simply pad to the multiple of 16 for both FP8 and BF16 precision + pad_to_multiple_of = 16 + data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm=False, + pad_to_multiple_of=pad_to_multiple_of, + ) + + dataloader_params = { + "batch_size": hyperparams.batch_size, + "collate_fn": data_collator, + "drop_last": True, + } + train_dataloader = DataLoader(dataset, **dataloader_params) + return train_dataloader + + +def init_baseline_model(hyperparams): + # Init the model + config = AutoConfig.from_pretrained(hyperparams.model_name) + # make sure to use flash_attention to do iso comparison with TEGemmaModel + config._attn_implementation = "flash_attention_2" + model = AutoModelForCausalLM.from_pretrained( + hyperparams.model_name, + config=config, + torch_dtype=torch.bfloat16, + ) + return model.cuda() + + +def init_te_llama_model(hyperparams): + cls = TELlamaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TELlamaForCausalLM + config = AutoConfig.from_pretrained(hyperparams.model_name) + config._attn_implementation = "flash_attention_2" + # config.hidden_size = 1024 + # config.head_dim = 128 + print(config) + # Adding all params from the hyperparams to the config to make the code simpler. + for key, value in hyperparams.__dict__.items(): + setattr(config, key, value) + model = load_te_model_llama(cls, config) + if hyperparams.generation_cuda_graphs: + model.record() + return model.cuda() + +def init_te_gemma_model(hyperparams): + cls = TEGemmaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TEGemmaForCausalLM + config = AutoConfig.from_pretrained(hyperparams.model_name) + config._attn_implementation = "flash_attention_2" + # config.hidden_size = 1024 + # config.head_dim = 128 + print(config) + # Adding all params from the hyperparams to the config to make the code simpler. 
+ for key, value in hyperparams.__dict__.items(): + setattr(config, key, value) + model = load_te_model(cls, config) + if hyperparams.generation_cuda_graphs: + model.record() + return model.cuda() + + +def wrap_with_accelerator(model, hyperparams): + # Create FP8 kwarg handler if required + fp8_kwarg_handler = ( + [FP8RecipeKwargs(backend="te")] if hyperparams.mixed_precision == "fp8" else None + ) + + # Init HF accelerator that's used for training + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision=hyperparams.mixed_precision, + kwargs_handlers=fp8_kwarg_handler, + ) + # accelerator.print(f'State: {accelerator.state}') + train_dataloader = get_dataloaders(accelerator, hyperparams) + + # Wrap model, optimizer/scheduler, dataloaders in accelerate + optimizer = AdamW(params=model.parameters(), lr=hyperparams.learning_rate, fused=True) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=100, + num_training_steps=hyperparams.num_training_steps, + ) + model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + return accelerator, model, optimizer, train_dataloader, lr_scheduler + + +def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler): + model.train() + optimizer.zero_grad() + train_dataloader = enumerate(train_dataloader) + + def run_iters(num_iters): + for _ in range(num_iters): + _, batch = next(train_dataloader) + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + run_iters(hyperparams.num_warmup_steps) # Warmup iters + + # Get the timers ready + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + + start.record() + run_iters(hyperparams.num_training_steps) # Training iters + torch.cuda.synchronize() + end.record() + accelerator.end_training() + + print( + f"""{hyperparams.num_training_steps} finetuning steps complete!\n + Average time taken per step: + {(start.elapsed_time(end)/hyperparams.num_training_steps):.0f} + milliseconds""" + ) + + +def restart_jupyter_notebook(): + # Try restarting the Jupyter kernel + IPython.Application.instance().kernel.do_shutdown(True) + + # Check whether the device memory has been flushed + if torch.cuda.memory_allocated() != 0: + import warnings + + warnings.warn("The device memory hasn't been flushed, trying with a second method!") + + # Try restarting the Jupyter kernel another way + # Restart the kernel + from IPython.core.display import HTML + + HTML("") + + if torch.cuda.memory_allocated() != 0: + print( + "The device memory hasn't been flushed, try manually restarting the Jupyter kernel!" + ) + + # Suppress the warnings + if not sys.warnoptions: + import warnings + + warnings.simplefilter("ignore") + torch.set_warn_always(False) + + +@torch.no_grad() +def run_forward_pass(model, hyperparams, num_iters): + """ + It runs num_iters forward passes with sample data. + """ + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision="no", + ) + train_dataloader = get_dataloaders(accelerator, hyperparams) + + # @sudhakars: what's the point of calling `model.train` inside `no_grad` + # context? 
+ model.train() + train_dataloader = enumerate(train_dataloader) + + for _ in range(num_iters): + _, batch = next(train_dataloader) + batch["input_ids"] = batch["input_ids"].cuda() + batch['attention_mask'] = batch["attention_mask"].cuda() + model(input_ids = batch["input_ids"], attention_mask = batch['attention_mask']) + + +""" + Benchmarking and example generation functions. +""" + + +def print_sample_of_generated_texts(model): + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + prompts = ["Here are the two facts about GPUs:", "Some facts about NVIDIA:"] + prompts *= 32 + inputs = tokenizer(prompts, return_tensors="pt", padding=True) + + + max_length = inputs["input_ids"].size(1) + new_length = ((max_length + 63) // 64) * 128 + + # Add padding to the left + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], (new_length - max_length, 0), value=tokenizer.pad_token_id + ) + + # Add padding to the left (only intended for baseline generation with HF + # which expects padding to the left) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (new_length - max_length, 0), value=0 + ) + + inputs["input_ids"] = inputs["input_ids"].cuda() + inputs["attention_mask"] = inputs["attention_mask"].cuda() + + outputs = model.generate(**inputs, max_new_tokens=50) + generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + def print_output(prompts, generated_texts, idx): + print("=" * 30 + f" Generation example {idx+1} " + "=" * 30) + print("Prompt:") + print(generated_texts[idx][: len(prompts[idx])]) + print("Generated text:") + print(generated_texts[idx][len(prompts[idx]) :]) + + for i in range(5): + print_output(prompts, generated_texts, i) + + +def _generate_random_words(num_words, max_word_length): + words = [] + for _ in range(num_words): + word_length = random.randint(1, max_word_length) + word = "".join(random.choices(string.ascii_lowercase, k=word_length)) + words.append(word) + return words + + +def benchmark_generation(model): + batch_size = 64 + context_length = 128 + max_new_tokens = 156 - 128 + print("=" * 30 + " Benchmarking " + "=" * 30) + print( + f"Benchmarking for batch_size = {batch_size} and max total tokens =" + f" {context_length + max_new_tokens}" + ) + + input_str = _generate_random_words(batch_size, context_length) + + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + inputs = tokenizer(input_str, return_tensors="pt", padding=True) + + max_length = inputs["input_ids"].size(1) + + # Add padding to the left + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], (context_length - max_length, 0), value=tokenizer.pad_token_id + ) + + # Add padding to the left (only intended for baseline generation with HF + # which expects padding to the left) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (context_length - max_length, 0), value=0 + ) + + inputs["input_ids"] = inputs["input_ids"].cuda() + inputs["attention_mask"] = inputs["attention_mask"].cuda() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start.record() + + model.generate(inputs["input_ids"].cuda(), max_new_tokens=max_new_tokens) + torch.cuda.synchronize() + end.record() + + print(f"Time: {start.elapsed_time(end)/1000:.2f} s.") diff --git a/transformer_engine/pytorch/attention/inference.py 
b/transformer_engine/pytorch/attention/inference.py index 8267bf63c7..8a33a7f047 100644 --- a/transformer_engine/pytorch/attention/inference.py +++ b/transformer_engine/pytorch/attention/inference.py @@ -214,6 +214,12 @@ def __init__( dtype=torch.int32, device=torch.cuda.current_device(), ) + self.cu_pre_step_seqlens = torch.zeros( + self.max_batch_size, + dtype=torch.int32, + device=torch.cuda.current_device(), + ) + def reset(self): """Reset InferenceParams state""" @@ -280,9 +286,12 @@ def pre_step( def get_seqlens_pre_step(self): """Get cached sequence lengths before the stepping""" - return torch.Tensor(list(self.sequences_pre_step.values())).to( + seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( dtype=torch.int32, device="cpu" ) + # return seqlens.cuda() + self.cu_pre_step_seqlens[:len(seqlens)].copy_(seqlens, non_blocking=True) + return self.cu_pre_step_seqlens def convert_paged_to_nonpaged(self, layer_number: int): """ @@ -455,14 +464,14 @@ def pre_step( finished_seqs = self.sequences.keys() - unfinished_seqs unfinished_indices = [i for i, j in enumerate(self.sequences) if j in unfinished_seqs] finished_indices = [i for i, j in enumerate(self.sequences) if j in finished_seqs] - self.batch_indices.copy_( + self.batch_indices.data[:].copy_( torch.Tensor( ( unfinished_indices + finished_indices + list(range(prev_batch_size, self.max_batch_size)) ) - ).to(dtype=torch.int32, device="cpu") + ) ) # Advance unfinished sequences diff --git a/transformer_engine/pytorch/attention/multi_head_attention.py b/transformer_engine/pytorch/attention/multi_head_attention.py index a9a687ef15..ec51d68cdd 100644 --- a/transformer_engine/pytorch/attention/multi_head_attention.py +++ b/transformer_engine/pytorch/attention/multi_head_attention.py @@ -764,8 +764,8 @@ def forward( # sequence_start = inference_params.seqlens[0] sequence_end = sequence_start + sequence_length - q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...] - k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...] + # q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...] + # k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...] 
query_layer = apply_rotary_pos_emb( query_layer, @@ -775,6 +775,7 @@ def forward( cu_seqlens=cu_seqlens_q, cp_size=self.cp_size, cp_rank=self.cp_rank, + start_positions=sequence_start, ) key_layer = apply_rotary_pos_emb( key_layer, @@ -784,6 +785,7 @@ def forward( cu_seqlens=cu_seqlens_kv, cp_size=self.cp_size, cp_rank=self.cp_rank, + start_positions=sequence_start, ) # =========================== diff --git a/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp b/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp index b13a90f876..200f6817fc 100644 --- a/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp +++ b/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp @@ -27,9 +27,10 @@ at::Tensor fused_rope_forward(const at::Tensor &input, const at::Tensor &freqs, auto freqs_cu = makeTransformerEngineTensor(freqs); auto output_cu = makeTransformerEngineTensor(output); - auto start_positions_cu = transformer_engine::TensorWrapper(); // empty cu_seqlens tensor + auto start_positions_cu = transformer_engine::TensorWrapper(); // empty start_positions tensor if (start_positions) { start_positions_cu = makeTransformerEngineTensor(start_positions.value()); + TORCH_CHECK(start_positions_cu.ndim() == 1, "expected 1D tensor"); } if (qkv_format == NVTE_QKV_Format::NVTE_THD) { From d56f4393f5f12995dbd38f1b611a955cb7e94c36 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Mon, 16 Jun 2025 11:47:52 -0700 Subject: [PATCH 2/7] remove extraneous code for easy debu Signed-off-by: Sudhakar Singh --- docs/examples/te_gemma/run_generation.py | 2 +- docs/examples/te_gemma/te_gemma.py | 310 +++------ docs/examples/te_gemma/te_gemma_save.py | 829 +++++++++++++++++++++++ 3 files changed, 932 insertions(+), 209 deletions(-) create mode 100755 docs/examples/te_gemma/te_gemma_save.py diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py index eb781f11cf..6c45b9d670 100755 --- a/docs/examples/te_gemma/run_generation.py +++ b/docs/examples/te_gemma/run_generation.py @@ -19,4 +19,4 @@ model = init_te_gemma_model(hyperparams) print_sample_of_generated_texts(model) -benchmark_generation(model) +# benchmark_generation(model) diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py index f24b700979..bab980cc28 100755 --- a/docs/examples/te_gemma/te_gemma.py +++ b/docs/examples/te_gemma/te_gemma.py @@ -19,87 +19,6 @@ import torch.nn.functional as F -def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): - """ - Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which - will be used later. - - (Currently a hack, this should be reformatted to a better method) - """ - - assert lengths_tensor is not None and max_input_length is not None, \ - "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" - torch.add( - inference_params.cached_sequence_lengths, - inference_params.input_sequence_lengths, - out=inference_params.cached_sequence_lengths) - # inference_params.input_sequence_lengths[:len(lengths_tensor)].copy_(lengths_tensor, non_blocking=True) - inference_params.input_sequence_lengths.copy_(lengths_tensor) - - inference_params.max_incoming_seq_len = max_input_length - - max_seqlen_q, max_seqlen_kv = inference_params.max_incoming_seq_len, inference_params.max_sequence_length - - # # Allocation of buffers, it works correctly with CUDA Graphs. 
- _allocator = StaticBufferAllocator() - NR_BUFFERS = 4 - - cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded = [ - _allocator(inference_params.max_batch_size + 1, dtype=torch.int32, device="cuda") - for _ in range(NR_BUFFERS) - ] - - torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) - torch.cumsum( - inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, - dim=0, out=cu_seqlens_kv[1:]) - # If layer has shape [b * s_layer, h, d] - # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] - cu_seqlens_q_padded.copy_( - torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q) - cu_seqlens_kv_padded.copy_( - torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv) - - # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) - - # print(inference_params.step_dict) - - def get_cache_params_in_infer_params(): - return max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded - - # For the time being, create an ad-hoc field in `inference_params` to get the variables. - # @sudhakars: to create a better way later. - inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params - -# This class has been modified from -# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py -class GemmaRotaryEmbedding(torch.nn.Module): - def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): - super().__init__() - - self.dim = dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) - self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) - - @torch.no_grad() - def forward(self, x, position_ids, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - self.inv_freq.to(x.device) - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) - position_ids_expanded = position_ids[:, None, :].float() - # Force float32 since bfloat16 loses precision on long contexts - # See https://github.com/huggingface/transformers/pull/29285 - device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" - with torch.autocast(device_type=device_type, enabled=False): - freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) - emb = torch.cat((freqs, freqs), dim=-1) - return emb.unsqueeze(2) # should return in [b, s, 1, d] format - - class StaticBufferAllocator(torch.nn.Module): """ This class is used when we use te.make_graphed_callable(). @@ -152,6 +71,10 @@ def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): zero_centered_gamma=True, ) + self.te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( + max_seq_len=self.gemma_config.max_position_embeddings + ).cuda() + def alloc(self, size, dtype, device): """ Allocated the buffer and works correctly with CUDA Graphs. @@ -160,67 +83,26 @@ def alloc(self, size, dtype, device): def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. 
- # if "self_attn_mask_type" in kwargs: - # attn_mask_type = kwargs['self_attn_mask_type'] - # else: - # attn_mask_type = "whatever_default_is" - - # if attn_mask_type == "arbitrary": - # # @sudhakars: following logic doesn't work for `thd` - # attn_mask = kwargs['attention_mask'] - # attention_mask_inv = ~attn_mask - # generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 - - # if generation_case: - # # @sudhakars: for some reason, `attention_mask` for generation is of the - # # form [b, 1, 1, s]. - # attention_mask_inv = attention_mask_inv.squeeze(1).squeeze(1) - # assert torch.tensor(torch.tensor(attention_mask_inv.shape).shape).item() == 2 - - # # Create `position_ids` on the fly using `attention_mask` since HF - # # does the same in generation logic. - # position_ids = attention_mask_inv.long().cumsum(-1) - 1 - # position_ids.masked_fill_(attention_mask_inv == 0, 1) - - # if "position_ids" in kwargs and kwargs['position_ids'] is not None: - # assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" - - # # convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for - # # context phase and context phase gets [b, s] sized attn mask - # seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] - # arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() - # for sample_idx in range(attn_mask.shape[0]): - # pad_len = attn_mask[sample_idx].sum().int().item() - # # set the columns to padded - # arbitrary_attn_mask[sample_idx, :, :, :pad_len] = True - # # set the rows to padded - # if not generation_case: - # arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True - # arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() - - # # Update the attention mask to arbitrary - # kwargs['attention_mask'] = arbitrary_attn_mask.cuda() - - # # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding - # # @sudhakars: change the hardcoded `dim` to something like config.head_dim - # te_rope_emb = GemmaRotaryEmbedding(dim=256, max_position_embeddings=self.gemma_config.max_position_embeddings).cuda() - # te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) - # else: + # When the `attention_mask` is not `arbitrary`, then for the purpose # of this tutorial, we're using `padding_causal` (for context) and # `padding` (for generation) # @sudhakars: find a better way to provide the `tensor_format` - te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( - max_seq_len=self.gemma_config.max_position_embeddings - ).cuda() + inference_params = kwargs["inference_params"] # @sudhakars: big assumption that the input is "sbhd" # batch_size = args[0].shape[0] - if inference_params.qkv_format_legacy == "thd": - ( - max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded - ) = inference_params.get_cache_params_from_infer_params() + + # if inference_params.qkv_format_legacy == "thd": + # cache_params = kwargs["cache_params"] + # max_seqlen_q = cache_params.max_seqlen_q + # max_seqlen_kv = cache_params.max_seqlen_kv + # cu_seqlens_q = cache_params.cu_seqlens_q + # cu_seqlens_kv = cache_params.cu_seqlens_kv + # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded + # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded + # print(f"input_sequence_lengths (in forward): 
\n{inference_params.input_sequence_lengths}") # this args cannot be passed to TransformerLayer keys_to_remove = [ @@ -233,16 +115,15 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e for key in keys_to_remove: kwargs.pop(key, None) - # import pdb; pdb.set_trace() # We need to return tuple to be compatible with HF. return ( super().forward( *args, - rotary_pos_emb=te_rope_emb, - cu_seqlens_q=cu_seqlens_q, - cu_seqlens_kv=cu_seqlens_kv, - max_seqlen_q=max_seqlen_q, - max_seqlen_kv=max_seqlen_kv, + rotary_pos_emb=self.te_rope_emb, + # cu_seqlens_q=cu_seqlens_q, + # cu_seqlens_kv=cu_seqlens_kv, + # max_seqlen_q=max_seqlen_q, + # max_seqlen_kv=max_seqlen_kv, **kwargs ), ) @@ -271,11 +152,13 @@ def set_inference_params(self, inference_params): # @sudhakars: is `arbitrary` fine being the default here? def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + print(f"StaticGemmaModel forward start") with torch.no_grad(): # static operation - for CUDA graphs hidden_states.data[:] = hidden_states.data[:] * self.normalizer for i, decoder_layer in enumerate(self.model.layers): + # print(f"layer {i}") hidden_states.data[:] = decoder_layer( hidden_states, attention_mask=attention_mask, @@ -288,7 +171,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = No hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs logits = self.lm_head(hidden_states) logits = logits.float() - return logits + return logits, hidden_states class GemmaGenerator(torch.nn.Module): @@ -311,7 +194,7 @@ def set_inference_params(self, inference_params): # @sudhakars: is `arbitrary` a good default value here? def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): - logits = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -325,27 +208,6 @@ def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_m return next_tokens -class PartialForwardWrapper(torch.nn.Module): - """ - This class wraps a `torch.nn.Module` while partially modifying its `forward` - - CUDAGraphs' `make_graphed_callables` method takes in a module but if only - `functools.partial` is used to wrap the module, it changes the modules' - type and that interferes with the `make_graphed_callables` intrinsics. - """ - def __init__(self, module, **kwargs): - super().__init__() - self.module = module - self.partial_forward = partial(self.module.forward, **kwargs) - - def __call__(self, *args, **kwargs): - return self.partial_forward(*args, **kwargs) - - # @sudhakars: should we use better abstraction? 
- def set_inference_params(self, *args, **kwargs): - return self.module.set_inference_params(*args, **kwargs) - - @contextmanager def replace_decoder(te_decoder_cls): """ @@ -442,13 +304,19 @@ def _create_inference_params(self, *args, **kwargs): *args, **kwargs ) - max_batch_size = kwargs["max_batch_size"] + # max_batch_size = kwargs["max_batch_size"] # Initialize some legacy params - infer_params.cached_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) - infer_params.input_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) + # _allocator = StaticBufferAllocator() + # infer_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # infer_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # These are updated in setup_cache_params_from_infer_params and they should be static for + # the duration of the context as well as the generation phase. + # infer_params.cu_seqlens_q, infer_params.cu_seqlens_kv, infer_params.cu_seqlens_q_padded, infer_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] return infer_params @@ -478,39 +346,42 @@ def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): # import pdb; pdb.set_trace() hidden_states = self._create_hidden_states_buffer(input_ids) - hidden_states.data[:] = self.model.embed_tokens(input_ids) + hidden_states.copy_(self.model.embed_tokens(input_ids)) # We need to update offsets before every forward pass to make cache work properly. lengths = input_ids.ne(0).sum(dim=1) + # import pdb; pdb.set_trace() if self.config.qkv_format == "thd": # inference_params.setup_before_new_input( # lengths_tensor=lengths, max_input_length=input_ids.shape[1] # ) lengths = input_ids.ne(0).sum(dim=1) - max_input_length = input_ids.shape[1] - setup_cache_params_from_infer_params(inference_params, lengths, max_input_length) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) else: inference_params.setup_before_new_input(length=input_ids.shape[1]) - logits = self._model_context_phase( + logits, hs_buffer = self._model_context_phase( hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary" + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", ) # We choose logits coresponding with last token in each sequence, # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) # Tensor when qkv_format == "thd" and - # they are the last token in the sequence when qkv_format != "thd". - if self.config.qkv_format == "thd": - logits = logits[ + # they are the last token in the sequence when qkv_format != "thd". 
+ # import pdb; pdb.set_trace() + import pdb; pdb.set_trace() - torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, : - ] - else: - logits = logits[:, -1, :] + # if self.config.qkv_format == "thd": + # logits = logits[ + + # torch.arange(logits.size(0)), lengths - 1, : + # ] + # else: + logits = logits[:, -1, :] next_tokens = torch.argmax(logits, dim=1) @@ -572,7 +443,7 @@ def generate( inference_params = self._create_inference_params( max_batch_size=batch_size, # num_layers=self.config.num_hidden_layers, - max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + max_sequence_length=128, num_heads_kv=self.config.num_key_value_heads, # num_heads_q=self.config.num_attention_heads, head_dim_v=self.config.head_dim, @@ -584,29 +455,34 @@ def generate( # is_cuda_graph=False ) - def init_cache_params_in_infer_params(inference_params): - inference_params.cached_sequence_lengths = torch.zeros( - (batch_size,), device="cuda", dtype=torch.int32) - inference_params.input_sequence_lengths = torch.zeros( - (batch_size,), device="cuda", dtype=torch.int32) + # def init_cache_params_in_infer_params(inference_params): + # _allocator = StaticBufferAllocator() + # inference_params.cached_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") + # inference_params.input_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") - init_cache_params_in_infer_params(inference_params) - inference_params.qkv_format_legacy = self.config.qkv_format + # init_cache_params_in_infer_params(inference_params) + + + # inference_params.qkv_format_legacy = self.config.qkv_format self._model_context_phase.set_inference_params(inference_params) self._model_generation_phase.set_inference_params(inference_params) + print(f"context phase start") + # import pdb; pdb.set_trace() hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + print(f"context phase done") # Generation phase. 
if self.config.qkv_format == "thd": # inference_params.setup_before_new_input( # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - setup_cache_params_from_infer_params(inference_params, - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), - max_input_length=1) + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) else: inference_params.setup_before_new_input(length=1) @@ -637,9 +513,8 @@ def init_cache_params_in_infer_params(inference_params): # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - setup_cache_params_from_infer_params(inference_params, - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), - max_input_length=1) + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) else: inference_params.setup_before_new_input(length=1) # next_tokens is static output tensor, so we need to clone it @@ -706,12 +581,16 @@ def __init__(self, config: GemmaConfig): ) ## Taken from TEGemmaForCausalLM above - max_batch_size = self.config.cuda_graphs_static_batch_size - # Initialize some legacy params - self.inference_params.cached_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) - self.inference_params.input_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) + # max_batch_size = self.config.cuda_graphs_static_batch_size + # # Initialize some legacy params + # _allocator = StaticBufferAllocator() + # self.inference_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # self.inference_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # self.inference_params.cu_seqlens_q, self.inference_params.cu_seqlens_kv, self.inference_params.cu_seqlens_q_padded, self.inference_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] # def init_cache_params_in_infer_params(inference_params): # inference_params.cached_sequence_lengths = torch.zeros( @@ -720,7 +599,7 @@ def __init__(self, config: GemmaConfig): # (batch_size,), device="cuda", dtype=torch.int32) # init_cache_params_in_infer_params(inference_params) - self.inference_params.qkv_format_legacy = self.config.qkv_format + # self.inference_params.qkv_format_legacy = self.config.qkv_format self._model_generation_phase.set_inference_params(self.inference_params) self._model_context_phase.set_inference_params(self.inference_params) @@ -737,30 +616,45 @@ def record(self): self.config.cuda_graphs_static_batch_size, self.config.cuda_graphs_static_max_context_len, ) - self.inference_params.reset() + # self.inference_params.reset() # self.inference_params.setup_before_new_input( # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), # max_input_length=input_shape[1], # ) - lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda") + + # [1] Should be same as lengths_tensor from TEGemmaForCausalLM + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - setup_cache_params_from_infer_params(self.inference_params, lengths, max_input_length) + 
self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + + print(f"context phase recording start") + # self._model_context_phase.model.layers = torch.nn.ModuleList([ + # self.record_graph( + # layer, + # self.hidden_states_buffer, + # self_attn_mask_type="padding_causal", + # inference_params=self.inference_params + # ) + # for layer in self._model_context_phase.model.layers + # ]) self._model_context_phase = self.record_graph( self._model_context_phase, self.hidden_states_buffer, attn_mask_type="padding_causal" ) # CUDA Graphs recording + print(f"context phase recording done") input_shape = (self.config.cuda_graphs_static_batch_size, 1) - self.inference_params.reset() + # self.inference_params.reset() # self.inference_params.setup_before_new_input( # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), # max_input_length=input_shape[1], # ) - lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda") + lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - setup_cache_params_from_infer_params(self.inference_params, lengths, max_input_length) + + self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) self._model_generation_phase = self.record_graph( self._model_generation_phase, @@ -802,7 +696,7 @@ def record_graph(self, function, input_tensor, **sample_kwargs): fp8_enabled=self.config.fp8, fp8_recipe=fp8_recipe, allow_unused_input=True, - num_warmup_iters=3, + num_warmup_iters=5, sample_kwargs=sample_kwargs, ) return graphed_function diff --git a/docs/examples/te_gemma/te_gemma_save.py b/docs/examples/te_gemma/te_gemma_save.py new file mode 100755 index 0000000000..a46f6a9b94 --- /dev/null +++ b/docs/examples/te_gemma/te_gemma_save.py @@ -0,0 +1,829 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.gemma.modeling_gemma import GemmaForCausalLM, GemmaConfig, GemmaModel + +import torch.nn.functional as F + +class CacheParams: + def __init__(self, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded): + self.max_seqlen_q = max_seqlen_q + self.max_seqlen_kv = max_seqlen_kv + self.cu_seqlens_q = cu_seqlens_q + self.cu_seqlens_kv = cu_seqlens_kv + self.cu_seqlens_q_padded = cu_seqlens_q_padded + self.cu_seqlens_kv_padded = cu_seqlens_kv_padded + + +def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): + """ + Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which + will be used later. 
+ + (Currently a hack, this should be reformatted to a better method) + """ + + assert lengths_tensor is not None and max_input_length is not None, \ + "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + + inference_params.max_incoming_seq_len = max_input_length + + lengths_tensor = lengths_tensor.to(inference_params.cu_seqlens_q.device) + + # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + + # print(inference_params.step_dict) + + # def get_cache_params_in_infer_params(): + # return CacheParams(max_seqlen_q, max_seqlen_kv, inference_params.cu_seqlens_q, inference_params.cu_seqlens_kv, inference_params.cu_seqlens_q_padded, inference_params.cu_seqlens_kv_padded) + + # For the time being, create an ad-hoc field in `inference_params` to get the variables. + # @sudhakars: to create a better way later. + # inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + +# This class has been modified from +# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py +class GemmaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + return emb.unsqueeze(2) # should return in [b, s, 1, d] format + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthless, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) + +class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `GemmaDecoderLayer` and easier to replace it in the code. 
+ + Args: + config: GemmaConfig + args: positional args (for compatibility with `GemmaDecoderLayer`) + kwargs: keyword args (for compatibility with `GemmaDecoderLayer`) + """ + + def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): + + self.gemma_config = config + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="geglu", + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.gemma_config.head_dim, + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, + ) + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. + """ + return self._allocator(size, dtype, device) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. + + # if "self_attn_mask_type" in kwargs: + # attn_mask_type = kwargs['self_attn_mask_type'] + # else: + # attn_mask_type = "whatever_default_is" + + # if attn_mask_type == "arbitrary": + # # @sudhakars: following logic doesn't work for `thd` + # attn_mask = kwargs['attention_mask'] + # attention_mask_inv = ~attn_mask + # generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + + # if generation_case: + # # @sudhakars: for some reason, `attention_mask` for generation is of the + # # form [b, 1, 1, s]. + # attention_mask_inv = attention_mask_inv.squeeze(1).squeeze(1) + # assert torch.tensor(torch.tensor(attention_mask_inv.shape).shape).item() == 2 + + # # Create `position_ids` on the fly using `attention_mask` since HF + # # does the same in generation logic. + # position_ids = attention_mask_inv.long().cumsum(-1) - 1 + # position_ids.masked_fill_(attention_mask_inv == 0, 1) + + # if "position_ids" in kwargs and kwargs['position_ids'] is not None: + # assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" 
+ + # # convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for + # # context phase and context phase gets [b, s] sized attn mask + # seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] + # arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() + # for sample_idx in range(attn_mask.shape[0]): + # pad_len = attn_mask[sample_idx].sum().int().item() + # # set the columns to padded + # arbitrary_attn_mask[sample_idx, :, :, :pad_len] = True + # # set the rows to padded + # if not generation_case: + # arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True + # arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() + + # # Update the attention mask to arbitrary + # kwargs['attention_mask'] = arbitrary_attn_mask.cuda() + + # # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding + # # @sudhakars: change the hardcoded `dim` to something like config.head_dim + # te_rope_emb = GemmaRotaryEmbedding(dim=256, max_position_embeddings=self.gemma_config.max_position_embeddings).cuda() + # te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) + # else: + # When the `attention_mask` is not `arbitrary`, then for the purpose + # of this tutorial, we're using `padding_causal` (for context) and + # `padding` (for generation) + # @sudhakars: find a better way to provide the `tensor_format` + te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( + max_seq_len=self.gemma_config.max_position_embeddings + ).cuda() + + inference_params = kwargs["inference_params"] + # @sudhakars: big assumption that the input is "sbhd" + # batch_size = args[0].shape[0] + + # if inference_params.qkv_format_legacy == "thd": + # cache_params = kwargs["cache_params"] + # max_seqlen_q = cache_params.max_seqlen_q + # max_seqlen_kv = cache_params.max_seqlen_kv + # cu_seqlens_q = cache_params.cu_seqlens_q + # cu_seqlens_kv = cache_params.cu_seqlens_kv + # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded + # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded + # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") + + # this args cannot be passed to TransformerLayer + keys_to_remove = [ + "position_ids", + "past_key_value", + "output_attentions", + "use_cache", + "cache_position", + ] + for key in keys_to_remove: + kwargs.pop(key, None) + + # We need to return tuple to be compatible with HF. + return ( + super().forward( + *args, + rotary_pos_emb=te_rope_emb, + # cu_seqlens_q=cu_seqlens_q, + # cu_seqlens_kv=cu_seqlens_kv, + # max_seqlen_q=max_seqlen_q, + # max_seqlen_kv=max_seqlen_kv, + **kwargs + ), + ) + +class StaticGemmaModel(torch.nn.Module): + """ + StaticGemma is based of HF GemmaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: GemmaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.normalizer = torch.tensor(self.model.config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? 
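        # Note on the forward pass below: results are written back into the caller-provided
        # `hidden_states` buffer (`hidden_states.data[:] = ...` and `hidden_states.copy_(...)`),
        # so the tensor's storage address never changes between CUDA Graph replays; only the
        # buffer contents are updated in place.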
+ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + print(f"StaticGemmaModel forward start") + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + # print(f"layer {i}") + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits, hidden_states + + +class GemmaGenerator(torch.nn.Module): + """ + GemmaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: GemmaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.gemma_layers = StaticGemmaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.gemma_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + return next_tokens + + +class PartialForwardWrapper(torch.nn.Module): + """ + This class wraps a `torch.nn.Module` while partially modifying its `forward` + + CUDAGraphs' `make_graphed_callables` method takes in a module but if only + `functools.partial` is used to wrap the module, it changes the modules' + type and that interferes with the `make_graphed_callables` intrinsics. + """ + def __init__(self, module, **kwargs): + super().__init__() + self.module = module + self.partial_forward = partial(self.module.forward, **kwargs) + + def __call__(self, *args, **kwargs): + return self.partial_forward(*args, **kwargs) + + # @sudhakars: should we use better abstraction? + def set_inference_params(self, *args, **kwargs): + return self.module.set_inference_params(*args, **kwargs) + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `GemmaDecoderLayer` with custom `TEGemmaDecoderLayer`. + """ + original_gemma_decoder_cls = transformers.models.gemma.modeling_gemma.GemmaDecoderLayer + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = original_gemma_decoder_cls + + +class TEGemmaForCausalLM(GemmaForCausalLM): + """ + Causal LM created with `GemmaModel`. The underlying `GemmaDecoderLayer` + class is monkey-patched with `TEGemmaDecoderLayer` class before + initializing the causal LM with `GemmaForCausalLM`. 
+ + Args: + config: GemmaConfig + """ + + def __init__(self, config: GemmaConfig): + with replace_decoder(te_decoder_cls=TEGemmaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = GemmaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticGemmaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths, max_seq_len=None): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) if max_seq_len is None else max_seq_len + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. fix the max seq len to + # the longest sequence in the batch + actual_max_seq_len = max_seq_len + inputs.data = new_input_ids[:, :actual_max_seq_len] + print(f"actual_max_seq_len: {actual_max_seq_len}") + + # For Paged Attention, make the valid sequences, multiple of 64 + # inputs.data = new_input_ids[:, :4].repeat(1, 16) + + + def _next_64_multiply(self, x): + return ((x + 63) // 64) * 64 + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_hidden_states_buffer(self, input_ids: torch.Tensor): + tensor = torch.empty( + (input_ids.shape[0], input_ids.shape[1], self.hidden_size), + device="cuda", + dtype=torch.float32, + ) + # import pdb; pdb.set_trace() + return tensor + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_inference_params(self, *args, **kwargs): + infer_params = InferenceParams( + *args, **kwargs + ) + + # max_batch_size = kwargs["max_batch_size"] + + # Initialize some legacy params + # _allocator = StaticBufferAllocator() + # infer_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # infer_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # These are updated in setup_cache_params_from_infer_params and they should be static for + # the duration of the context as well as the generation phase. + # infer_params.cu_seqlens_q, infer_params.cu_seqlens_kv, infer_params.cu_seqlens_q_padded, infer_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] + + return infer_params + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _get_max_input_seq_len(self, input_ids): + return input_ids.shape[1] \ + if not hasattr(self.config, "cuda_graphs_static_max_context_len") \ + else self.config.cuda_graphs_static_max_context_len + + # The buffer for generation is some part (beginning) of hidden states buffer. 
+ # This function returns pointer to it and also copies there data if provided. + def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): + # hidden_states_buffer has shape [b, s, hd] + # generation_buffer will have shape [b, 1, hd] + # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)" + # will return uncontiguous buffer, which we want to avoid. + output = hidden_states_buffer.view(-1)[ + : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2] + ] + if data_to_copy is not None: + output.copy_(data_to_copy.reshape(-1)) + generation_buffer = output.view( + (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2]) + ) + return generation_buffer + + def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): + # import pdb; pdb.set_trace() + hidden_states = self._create_hidden_states_buffer(input_ids) + hidden_states.data[:] = self.model.embed_tokens(input_ids) + + # We need to update offsets before every forward pass to make cache work properly. + lengths = input_ids.ne(0).sum(dim=1) + + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=lengths, max_input_length=input_ids.shape[1] + # ) + lengths = input_ids.ne(0).sum(dim=1) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + else: + inference_params.setup_before_new_input(length=input_ids.shape[1]) + + + logits, hs_buffer = self._model_context_phase( + hidden_states, + attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", + ) + + # We choose logits coresponding with last token in each sequence, + # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) + # Tensor when qkv_format == "thd" and + # they are the last token in the sequence when qkv_format != "thd". + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + logits = logits[ + + torch.arange(logits.size(0)), lengths - 1, : + ] + else: + logits = logits[:, -1, :] + + next_tokens = torch.argmax(logits, dim=1) + + # self.hidden_states have shape [b, s, hd]. + # We return hidden state for the last token - output has shape [b, 1, hd] + hidden_states = self._get_generation_buffer( + hidden_states, self.model.embed_tokens(next_tokens) + ) + return hidden_states, next_tokens + + def _make_mask_one_token_longer(self, mask): + return torch.cat( + [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1 + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + pad_token_id: int = 0, + max_new_tokens: int = 0, + *args, + **kwargs + ): + self.eval() + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. 
+ with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast( + enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None + ): + + lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] + + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) + + # This is not needed since the padding to the left is already done in utils.py + # # Pad input_ids with zeros on the left to match max_input_sequence_len + # # This adds padding tokens (0) to the left side of each sequence in the batch + # # Shape goes from [batch_size, seq_len] to [batch_size, max_input_sequence_len] + # input_ids = F.pad( + # input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 + # ) + + if self.config.qkv_format == "thd": + # For thd layout padding is at the end, otherwise at the beginning. + TEGemmaForCausalLM._padding_to_end(input_ids, + lengths, + max_seq_len=self.config.cuda_graphs_static_max_context_len \ + if self.config.generation_cuda_graphs else None + ) + + # import pdb; pdb.set_trace() + + # InferenceParams is a cache, where keys and values of previous tokens are stored. + # Moreover it stores length of both already generated and input sequences. + inference_params = self._create_inference_params( + max_batch_size=batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + # is_cuda_graph=False + ) + + # def init_cache_params_in_infer_params(inference_params): + # _allocator = StaticBufferAllocator() + # inference_params.cached_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") + # inference_params.input_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") + + # init_cache_params_in_infer_params(inference_params) + + + # inference_params.qkv_format_legacy = self.config.qkv_format + + self._model_context_phase.set_inference_params(inference_params) + self._model_generation_phase.set_inference_params(inference_params) + + print(f"context phase start") + # import pdb; pdb.set_trace() + hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + + print(f"context phase done") + # Generation phase. + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. 
+ # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + # setup_cache_params_from_infer_params(inference_params, input_ids) + # @sudhakars: could create position_ids from mask here + next_tokens = self._model_generation_phase(hidden_states, mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. + if self.config.qkv_format == "thd": + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + else: + inference_params.setup_before_new_input(length=1) + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="arbitrary" + ) + return logits + +class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): + """ + TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: GemmaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + self.config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. 
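        # A minimal sketch (hypothetical shapes b, s, h) of why _get_generation_buffer slices the
        # flat storage instead of indexing the first token position directly:
        #   buf = torch.empty(b, s, h, device="cuda")
        #   gen = buf[:, 0, :].unsqueeze(1)            # shares storage, but is not contiguous
        #   gen = buf.view(-1)[: b * h].view(b, 1, h)  # contiguous view of the same static storage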
+ self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) + # self.inference_params = InferenceParams( + # max_batch_size=config.cuda_graphs_static_batch_size, + # max_sequence_length=config.cuda_graphs_static_max_seq_len, + # qkv_format="thd", + # ) + self.inference_params = InferenceParams( + max_batch_size=self.config.cuda_graphs_static_batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self.config.cuda_graphs_static_max_seq_len, + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + # is_cuda_graph=False + ) + + ## Taken from TEGemmaForCausalLM above + # max_batch_size = self.config.cuda_graphs_static_batch_size + # # Initialize some legacy params + # _allocator = StaticBufferAllocator() + # self.inference_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # self.inference_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # self.inference_params.cu_seqlens_q, self.inference_params.cu_seqlens_kv, self.inference_params.cu_seqlens_q_padded, self.inference_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] + + # def init_cache_params_in_infer_params(inference_params): + # inference_params.cached_sequence_lengths = torch.zeros( + # (batch_size,), device="cuda", dtype=torch.int32) + # inference_params.input_sequence_lengths = torch.zeros( + # (batch_size,), device="cuda", dtype=torch.int32) + # init_cache_params_in_infer_params(inference_params) + + # self.inference_params.qkv_format_legacy = self.config.qkv_format + + self._model_generation_phase.set_inference_params(self.inference_params) + self._model_context_phase.set_inference_params(self.inference_params) + + def record(self): + # We want to record model in training=False, because it will be used in generation. + self.eval() + + # Here "the trick" happens. We override methods from TEGemmaForCausalLM + # with their recorded version. After invocation of each of them, + # captured graph will be replayed with minimal usage of CPU, + # what will lead to huge speedup. 
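        # Rough sketch of the record/replay flow implemented below (record_graph is defined
        # later in this class):
        #   self._model_context_phase = self.record_graph(self._model_context_phase,
        #                                                 self.hidden_states_buffer, ...)
        #   logits, _ = self._model_context_phase(self.hidden_states_buffer, ...)  # replays the graph
        # The replayed call only reads and writes the static buffers captured during recording,
        # so generate() must keep using exactly those buffers.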
+ input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + # self.inference_params.reset() + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + # max_input_length=input_shape[1], + # ) + + # [1] Should be same as lengths_tensor from TEGemmaForCausalLM + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) + max_input_length = input_shape[1] + + self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + + print(f"context phase recording start") + # self._model_context_phase.model.layers = torch.nn.ModuleList([ + # self.record_graph( + # layer, + # self.hidden_states_buffer, + # self_attn_mask_type="padding_causal", + # inference_params=self.inference_params + # ) + # for layer in self._model_context_phase.model.layers + # ]) + self._model_context_phase = self.record_graph( + self._model_context_phase, + self.hidden_states_buffer, + attn_mask_type="padding_causal" + ) # CUDA Graphs recording + + print(f"context phase recording done") + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + # self.inference_params.reset() + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + # max_input_length=input_shape[1], + # ) + lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) + max_input_length = input_shape[1] + + self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + + self._model_generation_phase = self.record_graph( + self._model_generation_phase, + self.generation_buffer, + attn_mask_type="padding" + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor, **sample_kwargs): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. 
+ with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=5, + sample_kwargs=sample_kwargs, + ) + return graphed_function From 6cd3c1a6fe44a0327ba286dbf15e1ceb34eab3de Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Tue, 17 Jun 2025 15:27:06 -0700 Subject: [PATCH 3/7] make cuda graphs work with non-paged and paged attention Signed-off-by: Sudhakar Singh --- docs/examples/te_gemma/te_gemma.py | 218 ++++-------------- .../pytorch/attention/inference.py | 20 +- 2 files changed, 54 insertions(+), 184 deletions(-) diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py index bab980cc28..cd59a081e8 100755 --- a/docs/examples/te_gemma/te_gemma.py +++ b/docs/examples/te_gemma/te_gemma.py @@ -19,22 +19,6 @@ import torch.nn.functional as F -class StaticBufferAllocator(torch.nn.Module): - """ - This class is used when we use te.make_graphed_callable(). - CUDA Graphs require all tensors to be static. Neverthless, - torch API make_graphed_callable() takes care of output of torch modules, - and makes them static. Thus by wrapping allocation of memory into - torch.nn.Module, we can greatly simplify our code. - """ - - # pylint: disable=no-self-use - def forward(self, size, dtype, device): - """ - Return buffer of given size, dtype and device. - """ - return torch.zeros(size, dtype=dtype, device=device) - class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): """ Wrapper class over TE's `TransformerLayer`. This makes the wrapper very @@ -71,39 +55,8 @@ def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): zero_centered_gamma=True, ) - self.te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( - max_seq_len=self.gemma_config.max_position_embeddings - ).cuda() - - def alloc(self, size, dtype, device): - """ - Allocated the buffer and works correctly with CUDA Graphs. - """ - return self._allocator(size, dtype, device) - def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. - - # When the `attention_mask` is not `arbitrary`, then for the purpose - # of this tutorial, we're using `padding_causal` (for context) and - # `padding` (for generation) - # @sudhakars: find a better way to provide the `tensor_format` - - - inference_params = kwargs["inference_params"] - # @sudhakars: big assumption that the input is "sbhd" - # batch_size = args[0].shape[0] - - # if inference_params.qkv_format_legacy == "thd": - # cache_params = kwargs["cache_params"] - # max_seqlen_q = cache_params.max_seqlen_q - # max_seqlen_kv = cache_params.max_seqlen_kv - # cu_seqlens_q = cache_params.cu_seqlens_q - # cu_seqlens_kv = cache_params.cu_seqlens_kv - # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded - # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded - # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") - # this args cannot be passed to TransformerLayer keys_to_remove = [ "position_ids", @@ -115,15 +68,12 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e for key in keys_to_remove: kwargs.pop(key, None) + rope_emb = kwargs.pop("rope_emb", None) # We need to return tuple to be compatible with HF. 
return ( super().forward( *args, - rotary_pos_emb=self.te_rope_emb, - # cu_seqlens_q=cu_seqlens_q, - # cu_seqlens_kv=cu_seqlens_kv, - # max_seqlen_q=max_seqlen_q, - # max_seqlen_kv=max_seqlen_kv, + rotary_pos_emb=rope_emb, **kwargs ), ) @@ -151,8 +101,8 @@ def set_inference_params(self, inference_params): self.inference_params = inference_params # @sudhakars: is `arbitrary` fine being the default here? - def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): - print(f"StaticGemmaModel forward start") + def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary", rope_emb: torch.Tensor = None): + # print(f"StaticGemmaModel forward start") with torch.no_grad(): # static operation - for CUDA graphs hidden_states.data[:] = hidden_states.data[:] * self.normalizer @@ -164,6 +114,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = No attention_mask=attention_mask, self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, inference_params=self.inference_params, + rope_emb=rope_emb )[ 0 ] # static copy - for CUDA graphs @@ -193,8 +144,8 @@ def set_inference_params(self, inference_params): self.gemma_layers.set_inference_params(inference_params) # @sudhakars: is `arbitrary` a good default value here? - def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): - logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary", rope_emb: torch.Tensor = None): + logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type, rope_emb=rope_emb) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -252,6 +203,10 @@ def __init__(self, config: GemmaConfig): fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" ) + self.te_rope_emb = RotaryPositionEmbedding(self.config.head_dim)( + max_seq_len=self.config.max_position_embeddings + ).cuda() + @staticmethod def _padding_to_end(inputs, lengths, max_seq_len=None): """ @@ -279,11 +234,13 @@ def _padding_to_end(inputs, lengths, max_seq_len=None): # the longest sequence in the batch actual_max_seq_len = max_seq_len inputs.data = new_input_ids[:, :actual_max_seq_len] - print(f"actual_max_seq_len: {actual_max_seq_len}") + # print(f"actual_max_seq_len: {actual_max_seq_len}") # For Paged Attention, make the valid sequences, multiple of 64 # inputs.data = new_input_ids[:, :4].repeat(1, 16) - + # import pdb; pdb.set_trace() + # print(f"inputs.data.shape: {inputs.data.shape}") + # exit() def _next_64_multiply(self, x): return ((x + 63) // 64) * 64 @@ -303,21 +260,6 @@ def _create_inference_params(self, *args, **kwargs): infer_params = InferenceParams( *args, **kwargs ) - - # max_batch_size = kwargs["max_batch_size"] - - # Initialize some legacy params - # _allocator = StaticBufferAllocator() - # infer_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") - # infer_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") - - # These are updated in setup_cache_params_from_infer_params and they should be static for - # the duration of the context as well as the generation phase. 
- # infer_params.cu_seqlens_q, infer_params.cu_seqlens_kv, infer_params.cu_seqlens_q_padded, infer_params.cu_seqlens_kv_padded = [ - # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") - # for _ in range(4) - # ] - return infer_params # This function is overriden in TeGEmmaForCausalLMCudaGraphs. @@ -366,22 +308,16 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", + rope_emb=self.te_rope_emb ) - # We choose logits coresponding with last token in each sequence, - # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) - # Tensor when qkv_format == "thd" and - # they are the last token in the sequence when qkv_format != "thd". - # import pdb; pdb.set_trace() - import pdb; pdb.set_trace() - - # if self.config.qkv_format == "thd": - # logits = logits[ + if self.config.qkv_format == "thd": + logits = logits[ - # torch.arange(logits.size(0)), lengths - 1, : - # ] - # else: - logits = logits[:, -1, :] + torch.arange(logits.size(0)), lengths - 1, : + ] + else: + logits = logits[:, -1, :] next_tokens = torch.argmax(logits, dim=1) @@ -416,17 +352,8 @@ def generate( lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] - batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( - input_ids - ) - - # This is not needed since the padding to the left is already done in utils.py - # # Pad input_ids with zeros on the left to match max_input_sequence_len - # # This adds padding tokens (0) to the left side of each sequence in the batch - # # Shape goes from [batch_size, seq_len] to [batch_size, max_input_sequence_len] - # input_ids = F.pad( - # input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 - # ) + # print(f"max_input_sequence_len: {max_input_sequence_len}") + # exit() if self.config.qkv_format == "thd": # For thd layout padding is at the end, otherwise at the beginning. @@ -436,7 +363,9 @@ def generate( if self.config.generation_cuda_graphs else None ) - # import pdb; pdb.set_trace() + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) # InferenceParams is a cache, where keys and values of previous tokens are stored. # Moreover it stores length of both already generated and input sequences. 
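            # A minimal sketch of how this cache object is driven (values are illustrative; the
            # real call sites below pass config-derived arguments):
            #
            #     from collections import OrderedDict
            #     infer_params = InferenceParams(
            #         max_batch_size=4,
            #         max_sequence_length=128,
            #         num_heads_kv=16,
            #         head_dim_k=256,
            #         head_dim_v=256,
            #         dtype=torch.bfloat16,
            #         is_paged=False,
            #     )
            #     # Context phase: register the real prompt length of every sequence in the batch.
            #     infer_params.pre_step(OrderedDict({0: 12, 1: 7, 2: 20, 3: 5}))
            #     # Every generation step afterwards advances each sequence by exactly one token.
            #     infer_params.pre_step(OrderedDict({i: 1 for i in range(4)}))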
@@ -451,36 +380,19 @@ def generate( dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) - # is_cuda_graph=False + total_num_pages=64 * 128 // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) ) - # def init_cache_params_in_infer_params(inference_params): - # _allocator = StaticBufferAllocator() - # inference_params.cached_sequence_lengths = _allocator( - # (batch_size,), dtype=torch.int32, device="cuda") - # inference_params.input_sequence_lengths = _allocator( - # (batch_size,), dtype=torch.int32, device="cuda") - - # init_cache_params_in_infer_params(inference_params) - - - # inference_params.qkv_format_legacy = self.config.qkv_format - self._model_context_phase.set_inference_params(inference_params) self._model_generation_phase.set_inference_params(inference_params) - print(f"context phase start") + # print(f"context phase start") # import pdb; pdb.set_trace() hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) - print(f"context phase done") + # print(f"context phase done") # Generation phase. if self.config.qkv_format == "thd": - # inference_params.setup_before_new_input( - # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), - # max_input_length=1, - # ) lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) else: @@ -499,9 +411,7 @@ def generate( # include the next token to be generated mask = self._make_mask_one_token_longer(mask) - # setup_cache_params_from_infer_params(inference_params, input_ids) - # @sudhakars: could create position_ids from mask here - next_tokens = self._model_generation_phase(hidden_states, mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + next_tokens = self._model_generation_phase(hidden_states, mask=mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary", rope_emb=self.te_rope_emb) # self.inference_params contains for example kv_cache. # This needs to be called before every pass, @@ -509,10 +419,6 @@ def generate( # Here we increase sequence offsets by one, # because we generated one token for every sequence. if self.config.qkv_format == "thd": - # self.inference_params.setup_before_new_input( - # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), - # max_input_length=1, - # ) lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) else: @@ -558,13 +464,9 @@ def __init__(self, config: GemmaConfig): self.config.hidden_size, ) ).cuda() + # This is in fact part of the buffer for hidden_states. 
self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) - # self.inference_params = InferenceParams( - # max_batch_size=config.cuda_graphs_static_batch_size, - # max_sequence_length=config.cuda_graphs_static_max_seq_len, - # qkv_format="thd", - # ) self.inference_params = InferenceParams( max_batch_size=self.config.cuda_graphs_static_batch_size, # num_layers=self.config.num_hidden_layers, @@ -576,31 +478,9 @@ def __init__(self, config: GemmaConfig): dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) - # is_cuda_graph=False + total_num_pages=64 * self.config.cuda_graphs_static_max_seq_len // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) ) - ## Taken from TEGemmaForCausalLM above - # max_batch_size = self.config.cuda_graphs_static_batch_size - # # Initialize some legacy params - # _allocator = StaticBufferAllocator() - # self.inference_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") - # self.inference_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") - - # self.inference_params.cu_seqlens_q, self.inference_params.cu_seqlens_kv, self.inference_params.cu_seqlens_q_padded, self.inference_params.cu_seqlens_kv_padded = [ - # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") - # for _ in range(4) - # ] - - # def init_cache_params_in_infer_params(inference_params): - # inference_params.cached_sequence_lengths = torch.zeros( - # (batch_size,), device="cuda", dtype=torch.int32) - # inference_params.input_sequence_lengths = torch.zeros( - # (batch_size,), device="cuda", dtype=torch.int32) - # init_cache_params_in_infer_params(inference_params) - - # self.inference_params.qkv_format_legacy = self.config.qkv_format - self._model_generation_phase.set_inference_params(self.inference_params) self._model_context_phase.set_inference_params(self.inference_params) @@ -616,11 +496,6 @@ def record(self): self.config.cuda_graphs_static_batch_size, self.config.cuda_graphs_static_max_context_len, ) - # self.inference_params.reset() - # self.inference_params.setup_before_new_input( - # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), - # max_input_length=input_shape[1], - # ) # [1] Should be same as lengths_tensor from TEGemmaForCausalLM lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) @@ -628,38 +503,27 @@ def record(self): self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) - print(f"context phase recording start") - # self._model_context_phase.model.layers = torch.nn.ModuleList([ - # self.record_graph( - # layer, - # self.hidden_states_buffer, - # self_attn_mask_type="padding_causal", - # inference_params=self.inference_params - # ) - # for layer in self._model_context_phase.model.layers - # ]) + # print(f"context phase recording start") + self._model_context_phase = self.record_graph( self._model_context_phase, self.hidden_states_buffer, - attn_mask_type="padding_causal" + attn_mask_type="padding_causal", + rope_emb=self.te_rope_emb ) # CUDA Graphs recording - print(f"context phase recording done") + # print(f"context phase recording done") input_shape = (self.config.cuda_graphs_static_batch_size, 1) - # self.inference_params.reset() - # self.inference_params.setup_before_new_input( - # lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), - # 
max_input_length=input_shape[1], - # ) + lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) - max_input_length = input_shape[1] self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) self._model_generation_phase = self.record_graph( self._model_generation_phase, self.generation_buffer, - attn_mask_type="padding" + attn_mask_type="padding", + rope_emb=self.te_rope_emb ) # CUDA Graphs recording """ diff --git a/transformer_engine/pytorch/attention/inference.py b/transformer_engine/pytorch/attention/inference.py index 8a33a7f047..bcd4d7de30 100644 --- a/transformer_engine/pytorch/attention/inference.py +++ b/transformer_engine/pytorch/attention/inference.py @@ -214,7 +214,7 @@ def __init__( dtype=torch.int32, device=torch.cuda.current_device(), ) - self.cu_pre_step_seqlens = torch.zeros( + self.pre_step_seqlens = torch.zeros( self.max_batch_size, dtype=torch.int32, device=torch.cuda.current_device(), @@ -272,6 +272,11 @@ def pre_step( for k, v in self.sequences.items(): self.sequences_pre_step[k] = v - step_dict[k] + pre_step_seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( + dtype=torch.int32, device="cpu" + ) + self.pre_step_seqlens[:len(pre_step_seqlens)].copy_(pre_step_seqlens, non_blocking=True) + seqlens_q = list(step_dict.values()) cu_seqlens_q = [0] + [sum(seqlens_q[:i]) for i in range(1, self.batch_size + 1)] cu_seqlens_q = cu_seqlens_q + [cu_seqlens_q[-1]] * (self.max_batch_size - self.batch_size) @@ -286,12 +291,13 @@ def pre_step( def get_seqlens_pre_step(self): """Get cached sequence lengths before the stepping""" - seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( - dtype=torch.int32, device="cpu" - ) - # return seqlens.cuda() - self.cu_pre_step_seqlens[:len(seqlens)].copy_(seqlens, non_blocking=True) - return self.cu_pre_step_seqlens + # seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( + # dtype=torch.int32, device="cpu" + # ) + # # return seqlens.cuda() + # self.cu_pre_step_seqlens[:len(seqlens)].copy_(seqlens, non_blocking=True) + # return self.cu_pre_step_seqlens + return self.pre_step_seqlens def convert_paged_to_nonpaged(self, layer_number: int): """ From 2d12b722ad27185e23015bf7f7b61c3867625b96 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Jun 2025 22:27:53 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/examples/te_gemma/check_cuda_graphs.py | 27 ++- docs/examples/te_gemma/check_gemm.py | 15 +- docs/examples/te_gemma/run_generation.py | 4 +- .../examples/te_gemma/run_generation_llama.py | 4 +- docs/examples/te_gemma/te_gemma.py | 106 +++++---- docs/examples/te_gemma/te_gemma_save.py | 151 ++++++++----- docs/examples/te_gemma/te_llama.py | 203 ++++++++++++------ docs/examples/te_gemma/utils.py | 7 +- .../pytorch/attention/inference.py | 3 +- 9 files changed, 337 insertions(+), 183 deletions(-) diff --git a/docs/examples/te_gemma/check_cuda_graphs.py b/docs/examples/te_gemma/check_cuda_graphs.py index fa198db5ef..aee35f6911 100644 --- a/docs/examples/te_gemma/check_cuda_graphs.py +++ b/docs/examples/te_gemma/check_cuda_graphs.py @@ -1,6 +1,7 @@ import torch from transformer_engine.pytorch import Linear, LayerNorm + # 1. 
Define model with static buffers class TE_Model(torch.nn.Module): def __init__(self, max_seq_len=4096): @@ -10,51 +11,57 @@ def __init__(self, max_seq_len=4096): self.attn_proj = Linear(1024, 1024) # Pre-allocate static buffers - self.register_buffer('kv_cache', torch.zeros(max_seq_len, 1024, device='cuda')) - self.register_buffer('attn_mask', torch.tril(torch.ones(max_seq_len, max_seq_len, device='cuda'))) + self.register_buffer("kv_cache", torch.zeros(max_seq_len, 1024, device="cuda")) + self.register_buffer( + "attn_mask", torch.tril(torch.ones(max_seq_len, max_seq_len, device="cuda")) + ) def forward(self, hidden_states, seq_start: int): # Dynamic slicing of static buffers seq_len = hidden_states.size(1) - current_mask = self.attn_mask[seq_start:seq_start+seq_len, :seq_len] + current_mask = self.attn_mask[seq_start : seq_start + seq_len, :seq_len] x = self.ln(hidden_states) x = self.attn_proj(x) # Update KV cache (in-place) - self.kv_cache[seq_start:seq_start+seq_len].copy_(x) + self.kv_cache[seq_start : seq_start + seq_len].copy_(x) return x + # 2. Create graphable callables model = TE_Model().cuda() -static_input = torch.randn(8, 256, 1024, device='cuda') # (batch, seq, hidden) -seq_start = torch.tensor(0, device='cuda') +static_input = torch.randn(8, 256, 1024, device="cuda") # (batch, seq, hidden) +seq_start = torch.tensor(0, device="cuda") # Wrap with CUDA Graphs graph_model = torch.cuda.make_graphed_callables( [model], # Module list sample_args=[(static_input, seq_start)], # Must match actual input structure # memory_pool=torch.cuda.graphs.graph_pool_handle(), - allow_unused_input=False + allow_unused_input=False, ) + # 3. Warmup and execution def run_inference(x, seq_start): # Inputs must match sample_args' device/type/shape - x = x.to('cuda', non_blocking=True).requires_grad_(False) - seq_start = seq_start.to('cuda', non_blocking=True) + x = x.to("cuda", non_blocking=True).requires_grad_(False) + seq_start = seq_start.to("cuda", non_blocking=True) with torch.cuda.amp.autocast(): return graph_model(x, seq_start) + # Warm-up (essential for TE's kernel auto-tuner) for _ in range(3): _ = run_inference(static_input, seq_start) torch.cuda.synchronize() + # 4. Usage with dynamic sequence lengths def process_batch(inputs, start_pos): # inputs: (batch, seq) on CPU - inputs_gpu = inputs.to('cuda', non_blocking=True) + inputs_gpu = inputs.to("cuda", non_blocking=True) # Output shares memory with pre-allocated buffers return run_inference(inputs_gpu, start_pos) diff --git a/docs/examples/te_gemma/check_gemm.py b/docs/examples/te_gemma/check_gemm.py index dbcc0f53af..1ed6edd23a 100755 --- a/docs/examples/te_gemma/check_gemm.py +++ b/docs/examples/te_gemma/check_gemm.py @@ -8,11 +8,13 @@ from transformer_engine.pytorch.module.base import get_workspace import transformer_engine.pytorch.cpp_extensions as cpp_tex + @functools.lru_cache(maxsize=None) def _empty_tensor() -> torch.Tensor: """Get tensor with no entries and no data""" return torch.Tensor() + def gemm( A: torch.Tensor, B: torch.Tensor, @@ -100,20 +102,23 @@ def gemm( assert ub is not None, "ub object is None!" 
_ = fn(*args) - import pdb; pdb.set_trace() + import pdb + + pdb.set_trace() return out, grad_bias, gelu_input + if __name__ == "__main__": fc2_weight = torch.load("fc2_weight.pth").cuda() - + base_repo = "/perfhome/mnt/wkstn/work/repos/te_gemma_gen_support/TransformerEngine/docs/examples/te_gemma/" base_repo = "" gelu_out = torch.load(base_repo + "gelu_out.pth").cuda() - + activation_dtype = torch.bfloat16 fc2_bias = _empty_tensor() use_fc2_bias = False - + dim_size = list(gelu_out.size()) dim_size[1] = fc2_weight.size(0) fc2_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) @@ -129,4 +134,4 @@ def gemm( ub_algo=None, ub=None, extra_output_tensor=None, - ) \ No newline at end of file + ) diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py index 6c45b9d670..bfe610d361 100755 --- a/docs/examples/te_gemma/run_generation.py +++ b/docs/examples/te_gemma/run_generation.py @@ -1,6 +1,8 @@ from utils import * -hyperparams.model_name = "/perfhome/repos/ckpt/models/gemma-7b-hf/" # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" +hyperparams.model_name = ( # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" + "/perfhome/repos/ckpt/models/gemma-7b-hf/" +) hyperparams.qkv_format = "thd" # hyperparams.generation_cuda_graphs = True # 709.8s diff --git a/docs/examples/te_gemma/run_generation_llama.py b/docs/examples/te_gemma/run_generation_llama.py index 2f90995bd1..1c3e6626ca 100755 --- a/docs/examples/te_gemma/run_generation_llama.py +++ b/docs/examples/te_gemma/run_generation_llama.py @@ -1,6 +1,8 @@ from utils import * -hyperparams.model_name = "/perfhome/repos/ckpt/models/llama2-7b-hf/" # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" +hyperparams.model_name = ( # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" + "/perfhome/repos/ckpt/models/llama2-7b-hf/" +) hyperparams.qkv_format = "thd" # model = init_te_llama_model(hyperparams) diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py index cd59a081e8..706ea16bc4 100755 --- a/docs/examples/te_gemma/te_gemma.py +++ b/docs/examples/te_gemma/te_gemma.py @@ -19,6 +19,7 @@ import torch.nn.functional as F + class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): """ Wrapper class over TE's `TransformerLayer`. This makes the wrapper very @@ -70,13 +71,8 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e rope_emb = kwargs.pop("rope_emb", None) # We need to return tuple to be compatible with HF. - return ( - super().forward( - *args, - rotary_pos_emb=rope_emb, - **kwargs - ), - ) + return (super().forward(*args, rotary_pos_emb=rope_emb, **kwargs),) + class StaticGemmaModel(torch.nn.Module): """ @@ -101,7 +97,13 @@ def set_inference_params(self, inference_params): self.inference_params = inference_params # @sudhakars: is `arbitrary` fine being the default here? 
- def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary", rope_emb: torch.Tensor = None): + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + rope_emb: torch.Tensor = None, + ): # print(f"StaticGemmaModel forward start") with torch.no_grad(): # static operation - for CUDA graphs @@ -114,7 +116,7 @@ def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = No attention_mask=attention_mask, self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, inference_params=self.inference_params, - rope_emb=rope_emb + rope_emb=rope_emb, )[ 0 ] # static copy - for CUDA graphs @@ -144,8 +146,16 @@ def set_inference_params(self, inference_params): self.gemma_layers.set_inference_params(inference_params) # @sudhakars: is `arbitrary` a good default value here? - def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary", rope_emb: torch.Tensor = None): - logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type, rope_emb=rope_emb) + def forward( + self, + hidden_states: torch.Tensor, + mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + rope_emb: torch.Tensor = None, + ): + logits, _ = self.gemma_layers( + hidden_states, attention_mask=mask, attn_mask_type=attn_mask_type, rope_emb=rope_emb + ) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -257,16 +267,16 @@ def _create_hidden_states_buffer(self, input_ids: torch.Tensor): # This function is overriden in TeGEmmaForCausalLMCudaGraphs. def _create_inference_params(self, *args, **kwargs): - infer_params = InferenceParams( - *args, **kwargs - ) + infer_params = InferenceParams(*args, **kwargs) return infer_params # This function is overriden in TeGEmmaForCausalLMCudaGraphs. def _get_max_input_seq_len(self, input_ids): - return input_ids.shape[1] \ - if not hasattr(self.config, "cuda_graphs_static_max_context_len") \ - else self.config.cuda_graphs_static_max_context_len + return ( + input_ids.shape[1] + if not hasattr(self.config, "cuda_graphs_static_max_context_len") + else self.config.cuda_graphs_static_max_context_len + ) # The buffer for generation is some part (beginning) of hidden states buffer. # This function returns pointer to it and also copies there data if provided. @@ -303,19 +313,15 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf else: inference_params.setup_before_new_input(length=input_ids.shape[1]) - logits, hs_buffer = self._model_context_phase( hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", - rope_emb=self.te_rope_emb + rope_emb=self.te_rope_emb, ) if self.config.qkv_format == "thd": - logits = logits[ - - torch.arange(logits.size(0)), lengths - 1, : - ] + logits = logits[torch.arange(logits.size(0)), lengths - 1, :] else: logits = logits[:, -1, :] @@ -357,10 +363,14 @@ def generate( if self.config.qkv_format == "thd": # For thd layout padding is at the end, otherwise at the beginning. 
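                # e.g. a left-padded row [0, 0, t1, t2, t3] becomes [t1, t2, t3, 0, 0]; the thd
                # layout expects each sequence's valid tokens at the start of its row, with the
                # padding moved to the end.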
- TEGemmaForCausalLM._padding_to_end(input_ids, + TEGemmaForCausalLM._padding_to_end( + input_ids, lengths, - max_seq_len=self.config.cuda_graphs_static_max_context_len \ - if self.config.generation_cuda_graphs else None + max_seq_len=( + self.config.cuda_graphs_static_max_context_len + if self.config.generation_cuda_graphs + else None + ), ) batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( @@ -380,7 +390,7 @@ def generate( dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64 * 128 // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64 * 128 // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) ) self._model_context_phase.set_inference_params(inference_params) @@ -393,8 +403,10 @@ def generate( # print(f"context phase done") # Generation phase. if self.config.qkv_format == "thd": - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) else: inference_params.setup_before_new_input(length=1) @@ -411,7 +423,12 @@ def generate( # include the next token to be generated mask = self._make_mask_one_token_longer(mask) - next_tokens = self._model_generation_phase(hidden_states, mask=mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary", rope_emb=self.te_rope_emb) + next_tokens = self._model_generation_phase( + hidden_states, + mask=mask, + attn_mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + rope_emb=self.te_rope_emb, + ) # self.inference_params contains for example kv_cache. # This needs to be called before every pass, @@ -419,8 +436,10 @@ def generate( # Here we increase sequence offsets by one, # because we generated one token for every sequence. 
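            # As a sketch, for a batch of 4 sequences the pre_step call below
            # receives OrderedDict([(0, 1), (1, 1), (2, 1), (3, 1)]): every
            # sequence id is mapped to the single token just generated for it.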
if self.config.qkv_format == "thd": - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) else: inference_params.setup_before_new_input(length=1) # next_tokens is static output tensor, so we need to clone it @@ -435,11 +454,14 @@ def forward(self, *args, **kwargs): hidden_states = self.model.embed_tokens(kwargs["input_ids"]) logits = self._model_context_phase( hidden_states, - attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="arbitrary" + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", ) return logits + class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): """ TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM @@ -478,7 +500,9 @@ def __init__(self, config: GemmaConfig): dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64 * self.config.cuda_graphs_static_max_seq_len // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64 + * self.config.cuda_graphs_static_max_seq_len + // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) ) self._model_generation_phase.set_inference_params(self.inference_params) @@ -501,7 +525,9 @@ def record(self): lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) # print(f"context phase recording start") @@ -509,7 +535,7 @@ def record(self): self._model_context_phase, self.hidden_states_buffer, attn_mask_type="padding_causal", - rope_emb=self.te_rope_emb + rope_emb=self.te_rope_emb, ) # CUDA Graphs recording # print(f"context phase recording done") @@ -517,13 +543,15 @@ def record(self): lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) - self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) self._model_generation_phase = self.record_graph( self._model_generation_phase, self.generation_buffer, attn_mask_type="padding", - rope_emb=self.te_rope_emb + rope_emb=self.te_rope_emb, ) # CUDA Graphs recording """ diff --git a/docs/examples/te_gemma/te_gemma_save.py b/docs/examples/te_gemma/te_gemma_save.py index a46f6a9b94..c83378840c 100755 --- a/docs/examples/te_gemma/te_gemma_save.py +++ b/docs/examples/te_gemma/te_gemma_save.py @@ -19,8 +19,17 @@ import torch.nn.functional as F + class CacheParams: - def __init__(self, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded): + def __init__( + self, + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, + ): self.max_seqlen_q = max_seqlen_q self.max_seqlen_kv = max_seqlen_kv self.cu_seqlens_q = cu_seqlens_q @@ -37,15 +46,18 @@ def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_i (Currently a hack, this should be reformatted to a better method) """ - assert 
lengths_tensor is not None and max_input_length is not None, \ - "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + assert ( + lengths_tensor is not None and max_input_length is not None + ), 'lengths_tensor and max_input_length should not be none for qkv_format = "thd"' inference_params.max_incoming_seq_len = max_input_length lengths_tensor = lengths_tensor.to(inference_params.cu_seqlens_q.device) # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) # print(inference_params.step_dict) @@ -56,6 +68,7 @@ def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_i # @sudhakars: to create a better way later. # inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + # This class has been modified from # https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py class GemmaRotaryEmbedding(torch.nn.Module): @@ -65,41 +78,48 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim) + ) self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) @torch.no_grad() def forward(self, x, position_ids, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] self.inv_freq.to(x.device) - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + device_type = ( + device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + ) with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - return emb.unsqueeze(2) # should return in [b, s, 1, d] format + return emb.unsqueeze(2) # should return in [b, s, 1, d] format class StaticBufferAllocator(torch.nn.Module): """ - This class is used when we use te.make_graphed_callable(). - CUDA Graphs require all tensors to be static. Neverthless, - torch API make_graphed_callable() takes care of output of torch modules, - and makes them static. Thus by wrapping allocation of memory into - torch.nn.Module, we can greatly simplify our code. + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthless, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. 
""" # pylint: disable=no-self-use def forward(self, size, dtype, device): """ - Return buffer of given size, dtype and device. + Return buffer of given size, dtype and device. """ return torch.zeros(size, dtype=dtype, device=device) + class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): """ Wrapper class over TE's `TransformerLayer`. This makes the wrapper very @@ -138,7 +158,7 @@ def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): def alloc(self, size, dtype, device): """ - Allocated the buffer and works correctly with CUDA Graphs. + Allocated the buffer and works correctly with CUDA Graphs. """ return self._allocator(size, dtype, device) @@ -210,7 +230,7 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e # cu_seqlens_kv = cache_params.cu_seqlens_kv # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded - # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") + # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") # this args cannot be passed to TransformerLayer keys_to_remove = [ @@ -232,10 +252,11 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e # cu_seqlens_kv=cu_seqlens_kv, # max_seqlen_q=max_seqlen_q, # max_seqlen_kv=max_seqlen_kv, - **kwargs + **kwargs, ), ) + class StaticGemmaModel(torch.nn.Module): """ StaticGemma is based of HF GemmaModel class. @@ -259,7 +280,12 @@ def set_inference_params(self, inference_params): self.inference_params = inference_params # @sudhakars: is `arbitrary` fine being the default here? - def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): print(f"StaticGemmaModel forward start") with torch.no_grad(): # static operation - for CUDA graphs @@ -301,8 +327,15 @@ def set_inference_params(self, inference_params): self.gemma_layers.set_inference_params(inference_params) # @sudhakars: is `arbitrary` a good default value here? - def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): - logits, _ = self.gemma_layers(hidden_states, attention_mask=mask, attn_mask_type = attn_mask_type) + def forward( + self, + hidden_states: torch.Tensor, + mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): + logits, _ = self.gemma_layers( + hidden_states, attention_mask=mask, attn_mask_type=attn_mask_type + ) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -324,6 +357,7 @@ class PartialForwardWrapper(torch.nn.Module): `functools.partial` is used to wrap the module, it changes the modules' type and that interferes with the `make_graphed_callables` intrinsics. """ + def __init__(self, module, **kwargs): super().__init__() self.module = module @@ -413,7 +447,6 @@ def _padding_to_end(inputs, lengths, max_seq_len=None): # For Paged Attention, make the valid sequences, multiple of 64 # inputs.data = new_input_ids[:, :4].repeat(1, 16) - def _next_64_multiply(self, x): return ((x + 63) // 64) * 64 @@ -429,9 +462,7 @@ def _create_hidden_states_buffer(self, input_ids: torch.Tensor): # This function is overriden in TeGEmmaForCausalLMCudaGraphs. 
def _create_inference_params(self, *args, **kwargs): - infer_params = InferenceParams( - *args, **kwargs - ) + infer_params = InferenceParams(*args, **kwargs) # max_batch_size = kwargs["max_batch_size"] @@ -451,9 +482,11 @@ def _create_inference_params(self, *args, **kwargs): # This function is overriden in TeGEmmaForCausalLMCudaGraphs. def _get_max_input_seq_len(self, input_ids): - return input_ids.shape[1] \ - if not hasattr(self.config, "cuda_graphs_static_max_context_len") \ - else self.config.cuda_graphs_static_max_context_len + return ( + input_ids.shape[1] + if not hasattr(self.config, "cuda_graphs_static_max_context_len") + else self.config.cuda_graphs_static_max_context_len + ) # The buffer for generation is some part (beginning) of hidden states buffer. # This function returns pointer to it and also copies there data if provided. @@ -490,7 +523,6 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf else: inference_params.setup_before_new_input(length=input_ids.shape[1]) - logits, hs_buffer = self._model_context_phase( hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), @@ -503,10 +535,7 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf # they are the last token in the sequence when qkv_format != "thd". # import pdb; pdb.set_trace() if self.config.qkv_format == "thd": - logits = logits[ - - torch.arange(logits.size(0)), lengths - 1, : - ] + logits = logits[torch.arange(logits.size(0)), lengths - 1, :] else: logits = logits[:, -1, :] @@ -531,7 +560,7 @@ def generate( pad_token_id: int = 0, max_new_tokens: int = 0, *args, - **kwargs + **kwargs, ): self.eval() @@ -557,10 +586,14 @@ def generate( if self.config.qkv_format == "thd": # For thd layout padding is at the end, otherwise at the beginning. 
- TEGemmaForCausalLM._padding_to_end(input_ids, + TEGemmaForCausalLM._padding_to_end( + input_ids, lengths, - max_seq_len=self.config.cuda_graphs_static_max_context_len \ - if self.config.generation_cuda_graphs else None + max_seq_len=( + self.config.cuda_graphs_static_max_context_len + if self.config.generation_cuda_graphs + else None + ), ) # import pdb; pdb.set_trace() @@ -578,7 +611,7 @@ def generate( dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) # is_cuda_graph=False ) @@ -591,7 +624,6 @@ def generate( # init_cache_params_in_infer_params(inference_params) - # inference_params.qkv_format_legacy = self.config.qkv_format self._model_context_phase.set_inference_params(inference_params) @@ -608,8 +640,10 @@ def generate( # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) else: inference_params.setup_before_new_input(length=1) @@ -628,7 +662,11 @@ def generate( # setup_cache_params_from_infer_params(inference_params, input_ids) # @sudhakars: could create position_ids from mask here - next_tokens = self._model_generation_phase(hidden_states, mask, attn_mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + next_tokens = self._model_generation_phase( + hidden_states, + mask, + attn_mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + ) # self.inference_params contains for example kv_cache. 
# This needs to be called before every pass, @@ -640,8 +678,10 @@ def generate( # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) else: inference_params.setup_before_new_input(length=1) # next_tokens is static output tensor, so we need to clone it @@ -656,11 +696,14 @@ def forward(self, *args, **kwargs): hidden_states = self.model.embed_tokens(kwargs["input_ids"]) logits = self._model_context_phase( hidden_states, - attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="arbitrary" + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", ) return logits + class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): """ TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM @@ -703,7 +746,7 @@ def __init__(self, config: GemmaConfig): dtype=torch.bfloat16, is_paged=self.config.is_paged, page_size=64, - total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) # is_cuda_graph=False ) @@ -753,7 +796,9 @@ def record(self): lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) print(f"context phase recording start") # self._model_context_phase.model.layers = torch.nn.ModuleList([ @@ -766,9 +811,7 @@ def record(self): # for layer in self._model_context_phase.model.layers # ]) self._model_context_phase = self.record_graph( - self._model_context_phase, - self.hidden_states_buffer, - attn_mask_type="padding_causal" + self._model_context_phase, self.hidden_states_buffer, attn_mask_type="padding_causal" ) # CUDA Graphs recording print(f"context phase recording done") @@ -781,12 +824,12 @@ def record(self): lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) max_input_length = input_shape[1] - self.inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) self._model_generation_phase = self.record_graph( - self._model_generation_phase, - self.generation_buffer, - attn_mask_type="padding" + self._model_generation_phase, self.generation_buffer, attn_mask_type="padding" ) # CUDA Graphs recording """ diff --git a/docs/examples/te_gemma/te_llama.py b/docs/examples/te_gemma/te_llama.py index 426b79cbf1..637f4f574c 100755 --- a/docs/examples/te_gemma/te_llama.py +++ b/docs/examples/te_gemma/te_llama.py @@ -19,6 +19,7 @@ import torch.nn.functional as F + def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): """ Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. 
which @@ -27,16 +28,21 @@ def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_i (Currently a hack, this should be reformatted to a better method) """ - assert lengths_tensor is not None and max_input_length is not None, \ - "lengths_tensor and max_input_length should not be none for qkv_format = \"thd\"" + assert ( + lengths_tensor is not None and max_input_length is not None + ), 'lengths_tensor and max_input_length should not be none for qkv_format = "thd"' torch.add( inference_params.cached_sequence_lengths, inference_params.input_sequence_lengths, - out=inference_params.cached_sequence_lengths) + out=inference_params.cached_sequence_lengths, + ) inference_params.input_sequence_lengths.copy_(lengths_tensor) inference_params.max_incoming_seq_len = max_input_length - max_seqlen_q, max_seqlen_kv = inference_params.max_incoming_seq_len, inference_params.max_sequence_length + max_seqlen_q, max_seqlen_kv = ( + inference_params.max_incoming_seq_len, + inference_params.max_sequence_length, + ) # # Allocation of buffers, it works correctly with CUDA Graphs. _allocator = StaticBufferAllocator() @@ -50,26 +56,40 @@ def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_i torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) torch.cumsum( inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, - dim=0, out=cu_seqlens_kv[1:]) + dim=0, + out=cu_seqlens_kv[1:], + ) # If layer has shape [b * s_layer, h, d] # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] cu_seqlens_q_padded.copy_( - torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q) + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q + ) cu_seqlens_kv_padded.copy_( - torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv) + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv + ) # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) - inference_params.pre_step(OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist()))) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) # print(inference_params.step_dict) def get_cache_params_in_infer_params(): - return max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + return ( + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, + ) # For the time being, create an ad-hoc field in `inference_params` to get the variables. # @sudhakars: to create a better way later. 
inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + # This class has been modified from # https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py class LlamaRotaryEmbedding(torch.nn.Module): @@ -79,41 +99,48 @@ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): self.dim = dim self.max_position_embeddings = max_position_embeddings self.base = base - inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim)) + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim) + ) self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) @torch.no_grad() def forward(self, x, position_ids, seq_len=None): # x: [bs, num_attention_heads, seq_len, head_size] self.inv_freq.to(x.device) - inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) position_ids_expanded = position_ids[:, None, :].float() # Force float32 since bfloat16 loses precision on long contexts # See https://github.com/huggingface/transformers/pull/29285 device_type = x.device.type - device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + device_type = ( + device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + ) with torch.autocast(device_type=device_type, enabled=False): freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) emb = torch.cat((freqs, freqs), dim=-1) - return emb.unsqueeze(2) # should return in [b, s, 1, d] format + return emb.unsqueeze(2) # should return in [b, s, 1, d] format class StaticBufferAllocator(torch.nn.Module): """ - This class is used when we use te.make_graphed_callable(). - CUDA Graphs require all tensors to be static. Neverthlessly, - torch API make_graphed_callable() takes care of output of torch modules, - and makes them static. Thus by wrapping allocation of memory into - torch.nn.Module, we can greatly simplify our code. + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthlessly, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. """ # pylint: disable=no-self-use def forward(self, size, dtype, device): """ - Return buffer of given size, dtype and device. + Return buffer of given size, dtype and device. """ return torch.zeros(size, dtype=dtype, device=device) + class TELlamaDecoderLayer(te.pytorch.TransformerLayer): """ Wrapper class over TE's `TransformerLayer`. 
This makes the wrapper very @@ -134,39 +161,39 @@ def __init__(self, config: LlamaConfig, layer_idx: int, *args, **kwargs): hidden_size=config.hidden_size, ffn_hidden_size=config.intermediate_size, num_attention_heads=config.num_attention_heads, - bias=False, # LLaMA specific + bias=False, # LLaMA specific layernorm_epsilon=config.rms_norm_eps, hidden_dropout=0, attention_dropout=0, fuse_qkv_params=config.fuse_qkv_params, normalization="RMSNorm", - activation="swiglu", # LLaMA specific + activation="swiglu", # LLaMA specific # attn_input_format=config.qkv_format, attn_input_format="bshd", num_gqa_groups=config.num_key_value_heads, - kv_channels=self.head_dim, # LLaMA specific + kv_channels=self.head_dim, # LLaMA specific layer_number=( layer_idx + 1 ), # Layer numbers in TE starts from 1, not 0 like in the HF. - zero_centered_gamma=True, # LLaMA specific + zero_centered_gamma=True, # LLaMA specific ) def alloc(self, size, dtype, device): """ - Allocated the buffer and works correctly with CUDA Graphs. + Allocated the buffer and works correctly with CUDA Graphs. """ return self._allocator(size, dtype, device) def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. if "self_attn_mask_type" in kwargs: - attn_mask_type = kwargs['self_attn_mask_type'] + attn_mask_type = kwargs["self_attn_mask_type"] else: attn_mask_type = "whatever_default_is" if attn_mask_type == "arbitrary": # @sudhakars: following logic doesn't work for `thd` - attn_mask = kwargs['attention_mask'] + attn_mask = kwargs["attention_mask"] attention_mask_inv = ~attn_mask generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 @@ -181,13 +208,21 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e position_ids = attention_mask_inv.long().cumsum(-1) - 1 position_ids.masked_fill_(attention_mask_inv == 0, 1) - if "position_ids" in kwargs and kwargs['position_ids'] is not None: - assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" + if "position_ids" in kwargs and kwargs["position_ids"] is not None: + assert torch.all( + torch.eq(position_ids, kwargs["position_ids"]) + ), "position ids don't match match exactly!" 
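            # Worked example (hypothetical left-padded batch, True = padding):
            # attn_mask = [[True, True, False, False]] gives
            # attention_mask_inv = [[False, False, True, True]], so
            # cumsum(-1) - 1 = [[-1, -1, 0, 1]] and, after masked_fill_ on the
            # padded slots, position_ids = [[1, 1, 0, 1]]: the valid tokens get
            # positions 0 and 1 while padded slots hold a dummy value of 1.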
# convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for # context phase and context phase gets [b, s] sized attn mask - seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] - arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() + seq_len = ( + 1 + if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + else attention_mask_inv.shape[1] + ) + arbitrary_attn_mask = torch.zeros( + attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1] + ).bool() for sample_idx in range(attn_mask.shape[0]): pad_len = attn_mask[sample_idx].sum().int().item() # set the columns to padded @@ -195,21 +230,25 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e # set the rows to padded if not generation_case: arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True - arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() + arbitrary_attn_mask[sample_idx] = torch.tril( + arbitrary_attn_mask[sample_idx].logical_not() + ).logical_not() # Update the attention mask to arbitrary - kwargs['attention_mask'] = arbitrary_attn_mask.cuda() + kwargs["attention_mask"] = arbitrary_attn_mask.cuda() # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding # @sudhakars: change the hardcoded `dim` to something like config.head_dim - te_rope_emb = LlamaRotaryEmbedding(dim=self.head_dim, max_position_embeddings=self.llama_config.max_position_embeddings).cuda() + te_rope_emb = LlamaRotaryEmbedding( + dim=self.head_dim, max_position_embeddings=self.llama_config.max_position_embeddings + ).cuda() te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) else: # When the `attention_mask` is not `arbitrary`, then for the purpose # of this tutorial, we're using `padding_causal` (for context) and # `padding` (for generation) # @sudhakars: find a better way to provide the `tensor_format` - te_rope_emb = RotaryPositionEmbedding(self.head_dim)( # Use self.head_dim + te_rope_emb = RotaryPositionEmbedding(self.head_dim)( # Use self.head_dim max_seq_len=self.llama_config.max_position_embeddings ).cuda() @@ -218,7 +257,12 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e # batch_size = args[0].shape[0] if inference_params.qkv_format_legacy == "thd": ( - max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, ) = inference_params.get_cache_params_from_infer_params() # this args cannot be passed to TransformerLayer @@ -246,6 +290,7 @@ def forward(self, *args, **kwargs): # We need to additionally pass positional e ), ) + class StaticLlamaModel(torch.nn.Module): """ StaticLlama is based of HF LlamaModel class. @@ -261,7 +306,7 @@ def __init__( ): super().__init__() self.model = model - self.llama_config = model.config # Store LlamaConfig + self.llama_config = model.config # Store LlamaConfig self.normalizer = torch.tensor(self.llama_config.hidden_size**0.5, dtype=dtype) self.mask = mask self.lm_head = lm_head @@ -270,7 +315,12 @@ def set_inference_params(self, inference_params): self.inference_params = inference_params # @sudhakars: is `arbitrary` fine being the default here? 
- def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, attn_mask_type: str = "arbitrary"): + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): # import pdb; pdb.set_trace() if hidden_states.shape[1] > 1: torch.save(hidden_states, "input_ctxt.pth") @@ -314,8 +364,10 @@ def set_inference_params(self, inference_params): self.llama_layers.set_inference_params(inference_params) # @sudhakars: is `arbitrary` a good default value here? - def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_type: str = "arbitrary"): - logits = self.llama_layers(hidden_states, attention_mask=mask, attn_mask_type = mask_type) + def forward( + self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_type: str = "arbitrary" + ): + logits = self.llama_layers(hidden_states, attention_mask=mask, attn_mask_type=mask_type) assert logits.shape[0] == hidden_states.shape[0] # b assert logits.shape[1] == hidden_states.shape[1] # seq_len @@ -336,9 +388,11 @@ def forward(self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_t # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - setup_cache_params_from_infer_params(self.inference_params, - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), - max_input_length=1) + setup_cache_params_from_infer_params( + self.inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1, + ) else: self.inference_params.setup_before_new_input(length=1) @@ -353,6 +407,7 @@ class PartialForwardWrapper(torch.nn.Module): `functools.partial` is used to wrap the module, it changes the modules' type and that interferes with the `make_graphed_callables` intrinsics. """ + def __init__(self, module, **kwargs): super().__init__() self.module = module @@ -441,7 +496,6 @@ def _padding_to_end(inputs, lengths): # For Paged Attention, make the valid sequences, multiple of 64 # inputs.data = new_input_ids[:, :4].repeat(1, 16) - def _next_64_multiply(self, x): return ((x + 63) // 64) * 64 @@ -455,17 +509,17 @@ def _create_hidden_states_buffer(self, input_ids: torch.Tensor): # This function is overriden in TeGEmmaForCausalLMCudaGraphs. 
def _create_inference_params(self, *args, **kwargs): - infer_params = InferenceParams( - *args, **kwargs - ) + infer_params = InferenceParams(*args, **kwargs) max_batch_size = kwargs["max_batch_size"] # Initialize some legacy params infer_params.cached_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) + (max_batch_size,), device="cuda", dtype=torch.int32 + ) infer_params.input_sequence_lengths = torch.zeros( - (max_batch_size,), device="cuda", dtype=torch.int32) + (max_batch_size,), device="cuda", dtype=torch.int32 + ) return infer_params @@ -507,11 +561,10 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf else: inference_params.setup_before_new_input(length=input_ids.shape[1]) - logits = self._model_context_phase( hidden_states, attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary" + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", ) # We choose logits coresponding with last token in each sequence, @@ -520,7 +573,6 @@ def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: Inf # they are the last token in the sequence when qkv_format != "thd". if self.config.qkv_format == "thd": logits = logits[ - torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, : ] else: @@ -585,15 +637,17 @@ def generate( dtype=torch.bfloat16, is_paged=True, page_size=64, - total_num_pages=64 *3, # 64 * 64 (max_sequence_length) / 64 (page_size) + total_num_pages=64 * 3, # 64 * 64 (max_sequence_length) / 64 (page_size) # is_cuda_graph=False ) def init_cache_params_in_infer_params(inference_params): inference_params.cached_sequence_lengths = torch.zeros( - (batch_size,), device="cuda", dtype=torch.int32) + (batch_size,), device="cuda", dtype=torch.int32 + ) inference_params.input_sequence_lengths = torch.zeros( - (batch_size,), device="cuda", dtype=torch.int32) + (batch_size,), device="cuda", dtype=torch.int32 + ) init_cache_params_in_infer_params(inference_params) inference_params.qkv_format_legacy = self.config.qkv_format @@ -609,9 +663,11 @@ def init_cache_params_in_infer_params(inference_params): # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), # max_input_length=1, # ) - setup_cache_params_from_infer_params(inference_params, - lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), - max_input_length=1) + setup_cache_params_from_infer_params( + inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1, + ) else: inference_params.setup_before_new_input(length=1) @@ -630,7 +686,11 @@ def init_cache_params_in_infer_params(inference_params): # setup_cache_params_from_infer_params(inference_params, input_ids) # @sudhakars: could create position_ids from mask here - next_tokens = self._model_generation_phase(hidden_states, mask, mask_type="padding" if self.config.qkv_format=="thd" else "arbitrary") + next_tokens = self._model_generation_phase( + hidden_states, + mask, + mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + ) # next_tokens is static output tensor, so we need to clone it # - it gets changed every iteration. 
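            # In other words, the graphed generation phase replays into the same
            # output storage on every launch; without .clone(), every entry of
            # output_tokens would alias that one buffer and end up holding only
            # the tokens produced by the final decoding step.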
output_tokens.append(next_tokens.clone()) @@ -643,11 +703,14 @@ def forward(self, *args, **kwargs): hidden_states = self.model.embed_tokens(kwargs["input_ids"]) logits = self._model_context_phase( hidden_states, - attention_mask=((kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None), - attn_mask_type="arbitrary" + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", ) return logits + class TELlamaForCausalLMCudaGraphs(TELlamaForCausalLM): """ TELlamaForCausalLMCudaGraphs is the version of the class TELlamaForCausalLM @@ -701,10 +764,13 @@ def record(self): max_input_length=input_shape[1], ) self._model_context_phase = self.record_graph( - PartialForwardWrapper(self._model_context_phase, attn_mask_type="padding_causal" - if self.inference_params.qkv_format == "thd" - else "arbitrary"), - self.hidden_states_buffer + PartialForwardWrapper( + self._model_context_phase, + attn_mask_type=( + "padding_causal" if self.inference_params.qkv_format == "thd" else "arbitrary" + ), + ), + self.hidden_states_buffer, ) # CUDA Graphs recording input_shape = (self.config.cuda_graphs_static_batch_size, 1) @@ -714,10 +780,11 @@ def record(self): max_input_length=input_shape[1], ) self._model_generation_phase = self.record_graph( - PartialForwardWrapper(self._model_generation_phase, mask_type="padding" - if self.inference_params.qkv_format=="thd" - else "arbitrary"), - self.generation_buffer + PartialForwardWrapper( + self._model_generation_phase, + mask_type="padding" if self.inference_params.qkv_format == "thd" else "arbitrary", + ), + self.generation_buffer, ) # CUDA Graphs recording """ diff --git a/docs/examples/te_gemma/utils.py b/docs/examples/te_gemma/utils.py index 46577071c8..27e07ee15a 100755 --- a/docs/examples/te_gemma/utils.py +++ b/docs/examples/te_gemma/utils.py @@ -29,6 +29,7 @@ from te_gemma import TEGemmaForCausalLM, TEGemmaForCausalLMCudaGraphs from te_llama import TELlamaForCausalLM, TELlamaForCausalLMCudaGraphs + class HyperParameters: def __init__(self): self.mixed_precision = "bf16" @@ -133,6 +134,7 @@ def init_te_llama_model(hyperparams): model.record() return model.cuda() + def init_te_gemma_model(hyperparams): cls = TEGemmaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TEGemmaForCausalLM config = AutoConfig.from_pretrained(hyperparams.model_name) @@ -265,8 +267,8 @@ def run_forward_pass(model, hyperparams, num_iters): for _ in range(num_iters): _, batch = next(train_dataloader) batch["input_ids"] = batch["input_ids"].cuda() - batch['attention_mask'] = batch["attention_mask"].cuda() - model(input_ids = batch["input_ids"], attention_mask = batch['attention_mask']) + batch["attention_mask"] = batch["attention_mask"].cuda() + model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]) """ @@ -282,7 +284,6 @@ def print_sample_of_generated_texts(model): prompts *= 32 inputs = tokenizer(prompts, return_tensors="pt", padding=True) - max_length = inputs["input_ids"].size(1) new_length = ((max_length + 63) // 64) * 128 diff --git a/transformer_engine/pytorch/attention/inference.py b/transformer_engine/pytorch/attention/inference.py index bcd4d7de30..62a724ef79 100644 --- a/transformer_engine/pytorch/attention/inference.py +++ b/transformer_engine/pytorch/attention/inference.py @@ -220,7 +220,6 @@ def __init__( device=torch.cuda.current_device(), ) - def reset(self): """Reset InferenceParams state""" self.sequences = OrderedDict() @@ -275,7 +274,7 @@ def pre_step( 
pre_step_seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( dtype=torch.int32, device="cpu" ) - self.pre_step_seqlens[:len(pre_step_seqlens)].copy_(pre_step_seqlens, non_blocking=True) + self.pre_step_seqlens[: len(pre_step_seqlens)].copy_(pre_step_seqlens, non_blocking=True) seqlens_q = list(step_dict.values()) cu_seqlens_q = [0] + [sum(seqlens_q[:i]) for i in range(1, self.batch_size + 1)] From 97b756c6e5b05e6f4f8207fdbeef8226b3fb3113 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Tue, 24 Jun 2025 16:41:23 -0700 Subject: [PATCH 5/7] perf imp for kv cache ops Signed-off-by: Sudhakar Singh --- transformer_engine/common/fused_attn/kv_cache.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformer_engine/common/fused_attn/kv_cache.cu b/transformer_engine/common/fused_attn/kv_cache.cu index af69faaabe..ea468e435b 100644 --- a/transformer_engine/common/fused_attn/kv_cache.cu +++ b/transformer_engine/common/fused_attn/kv_cache.cu @@ -116,14 +116,14 @@ void copy_to_kv_cache_launcher(Tensor new_k, Tensor new_v, Tensor k_cache, Tenso bool is_non_paged, cudaStream_t stream) { if (new_k.has_data() && new_v.has_data() && k_cache.has_data() && v_cache.has_data()) { if (is_non_paged) { - reindex_kv_cache_kernel<<<16, 256, 0, stream>>>( + reindex_kv_cache_kernel<<<128, 1024, 0, stream>>>( reinterpret_cast(k_cache.data.dptr), reinterpret_cast(v_cache.data.dptr), reinterpret_cast(page_table.data.dptr), reinterpret_cast(cu_new_lens.data.dptr), reinterpret_cast(cu_cached_lens.data.dptr), h_kv, d_k, d_v, b, max_seq_len); } - copy_to_kv_cache_kernel<<<16, 256, 0, stream>>>( + copy_to_kv_cache_kernel<<<128, 1024, 0, stream>>>( reinterpret_cast(new_k.data.dptr), reinterpret_cast(new_v.data.dptr), reinterpret_cast(k_cache.data.dptr), reinterpret_cast(v_cache.data.dptr), reinterpret_cast(page_table.data.dptr), From 5011eb33eec6fd0742a5e63fb656d7b6e067ad41 Mon Sep 17 00:00:00 2001 From: Sudhakar Singh Date: Tue, 24 Jun 2025 16:43:26 -0700 Subject: [PATCH 6/7] add code for calibration Signed-off-by: Sudhakar Singh --- docs/examples/te_gemma/run_generation.py | 57 ++++++++++++++++++------ 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py index 6c45b9d670..e208598dfa 100755 --- a/docs/examples/te_gemma/run_generation.py +++ b/docs/examples/te_gemma/run_generation.py @@ -1,22 +1,51 @@ from utils import * +import transformer_engine.pytorch as te hyperparams.model_name = "/perfhome/repos/ckpt/models/gemma-7b-hf/" # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" hyperparams.qkv_format = "thd" -# hyperparams.generation_cuda_graphs = True # 709.8s -hyperparams.generation_cuda_graphs = True +run_generation = True +run_calibration = False -if hyperparams.generation_cuda_graphs: - # It is necessary to preallocate a static buffer. - # CUDA graphs require static input tensors for every kernel. - # This approach may result in a slight increase in memory consumption; - # however, the substantial speedup achieved makes it worthwhile. - hyperparams.cuda_graphs_static_batch_size = 64 - hyperparams.cuda_graphs_static_max_seq_len = 1024 - hyperparams.cuda_graphs_static_max_context_len = 128 +if run_calibration: + hyperparams.fuse_qkv_params = True # This is needed by the last improvement. 
-hyperparams.is_paged = False -model = init_te_gemma_model(hyperparams) + model = init_te_gemma_model(hyperparams) + + # Calibration + with te.fp8_autocast(enabled=False, calibrating=True), \ + torch.autocast(device_type='cuda', dtype=torch.bfloat16): + model.train() + run_forward_pass(model, hyperparams, num_iters=512) + + # Compute scale_fwd with enabled fp8 autocast + with te.fp8_autocast(enabled=True), \ + torch.autocast(device_type='cuda', dtype=torch.bfloat16): + run_forward_pass(model, hyperparams, 1) + + # Some parameters are in pointing to the same tensors, double save is avoided here. + dict_to_save = {k: v for k, v in model.state_dict().items() \ + if ("_context_phase" not in k and "_generation_phase" not in k)} + torch.save(dict_to_save, 'calibrated_weights.pth') # <== Add path to save calibrated weights. + + +if run_generation: + + # hyperparams.generation_cuda_graphs = False # 4.15s + hyperparams.generation_cuda_graphs = True # 4.38s + + if hyperparams.generation_cuda_graphs: + # It is necessary to preallocate a static buffer. + # CUDA graphs require static input tensors for every kernel. + # This approach may result in a slight increase in memory consumption; + # however, the substantial speedup achieved makes it worthwhile. + hyperparams.cuda_graphs_static_batch_size = 64 + hyperparams.cuda_graphs_static_max_seq_len = 128 + hyperparams.cuda_graphs_static_max_context_len = 128 + + hyperparams.is_paged = False + model = init_te_gemma_model(hyperparams) + + print_sample_of_generated_texts(model) + benchmark_generation(model) -print_sample_of_generated_texts(model) -# benchmark_generation(model) From 0f7ea225be20982cfc28723c9027745465390d0c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 24 Jun 2025 23:52:23 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/examples/te_gemma/run_generation.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py index 87e5e40f01..910fa325d0 100755 --- a/docs/examples/te_gemma/run_generation.py +++ b/docs/examples/te_gemma/run_generation.py @@ -10,31 +10,34 @@ run_calibration = False if run_calibration: - hyperparams.fuse_qkv_params = True # This is needed by the last improvement. + hyperparams.fuse_qkv_params = True # This is needed by the last improvement. model = init_te_gemma_model(hyperparams) # Calibration - with te.fp8_autocast(enabled=False, calibrating=True), \ - torch.autocast(device_type='cuda', dtype=torch.bfloat16): + with te.fp8_autocast(enabled=False, calibrating=True), torch.autocast( + device_type="cuda", dtype=torch.bfloat16 + ): model.train() run_forward_pass(model, hyperparams, num_iters=512) # Compute scale_fwd with enabled fp8 autocast - with te.fp8_autocast(enabled=True), \ - torch.autocast(device_type='cuda', dtype=torch.bfloat16): + with te.fp8_autocast(enabled=True), torch.autocast(device_type="cuda", dtype=torch.bfloat16): run_forward_pass(model, hyperparams, 1) # Some parameters are in pointing to the same tensors, double save is avoided here. - dict_to_save = {k: v for k, v in model.state_dict().items() \ - if ("_context_phase" not in k and "_generation_phase" not in k)} - torch.save(dict_to_save, 'calibrated_weights.pth') # <== Add path to save calibrated weights. 
+ dict_to_save = { + k: v + for k, v in model.state_dict().items() + if ("_context_phase" not in k and "_generation_phase" not in k) + } + torch.save(dict_to_save, "calibrated_weights.pth") # <== Add path to save calibrated weights. if run_generation: # hyperparams.generation_cuda_graphs = False # 4.15s - hyperparams.generation_cuda_graphs = True # 4.38s + hyperparams.generation_cuda_graphs = True # 4.38s if hyperparams.generation_cuda_graphs: # It is necessary to preallocate a static buffer. @@ -50,4 +53,3 @@ print_sample_of_generated_texts(model) benchmark_generation(model) -
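
For reference, the calibration workflow that PATCH 6/7 adds to run_generation.py reduces to the short sketch below. It assumes the helpers defined in docs/examples/te_gemma/utils.py in PATCH 1/7 (the shared `hyperparams` object, `init_te_gemma_model`, `run_forward_pass`) behave as shown in the diffs above; the weight path, the number of calibration iterations, and the output filename are illustrative placeholders rather than fixed values.

    import torch
    import transformer_engine.pytorch as te

    from utils import hyperparams, init_te_gemma_model, run_forward_pass

    # Calibration pass: FP8 execution stays disabled, but amax/scale statistics
    # are collected so FP8 can be enabled later for the same weights.
    hyperparams.model_name = "/path/to/gemma-7b-hf"  # illustrative path
    hyperparams.fuse_qkv_params = True
    model = init_te_gemma_model(hyperparams)

    with te.fp8_autocast(enabled=False, calibrating=True), torch.autocast(
        device_type="cuda", dtype=torch.bfloat16
    ):
        model.train()
        run_forward_pass(model, hyperparams, num_iters=512)

    # One FP8-enabled pass materializes the forward scaling factors (scale_fwd).
    with te.fp8_autocast(enabled=True), torch.autocast(
        device_type="cuda", dtype=torch.bfloat16
    ):
        run_forward_pass(model, hyperparams, num_iters=1)

    # Skip the graphed context/generation phase wrappers, which point at the
    # same underlying weight tensors, to avoid saving duplicates.
    state = {
        k: v
        for k, v in model.state_dict().items()
        if "_context_phase" not in k and "_generation_phase" not in k
    }
    torch.save(state, "calibrated_weights.pth")

The saved state dict is the artifact the FP8 generation path in the accompanying tutorial notebook is expected to load.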