diff --git a/docs/examples/te_gemma/check_cuda_graphs.py b/docs/examples/te_gemma/check_cuda_graphs.py new file mode 100644 index 0000000000..aee35f6911 --- /dev/null +++ b/docs/examples/te_gemma/check_cuda_graphs.py @@ -0,0 +1,67 @@ +import torch +from transformer_engine.pytorch import Linear, LayerNorm + + +# 1. Define model with static buffers +class TE_Model(torch.nn.Module): + def __init__(self, max_seq_len=4096): + super().__init__() + self.max_seq_len = max_seq_len + self.ln = LayerNorm(1024) + self.attn_proj = Linear(1024, 1024) + + # Pre-allocate static buffers + self.register_buffer("kv_cache", torch.zeros(max_seq_len, 1024, device="cuda")) + self.register_buffer( + "attn_mask", torch.tril(torch.ones(max_seq_len, max_seq_len, device="cuda")) + ) + + def forward(self, hidden_states, seq_start: int): + # Dynamic slicing of static buffers + seq_len = hidden_states.size(1) + current_mask = self.attn_mask[seq_start : seq_start + seq_len, :seq_len] + + x = self.ln(hidden_states) + x = self.attn_proj(x) + # Update KV cache (in-place) + self.kv_cache[seq_start : seq_start + seq_len].copy_(x) + return x + + +# 2. Create graphable callables +model = TE_Model().cuda() +static_input = torch.randn(8, 256, 1024, device="cuda") # (batch, seq, hidden) +seq_start = torch.tensor(0, device="cuda") + +# Wrap with CUDA Graphs +graph_model = torch.cuda.make_graphed_callables( + [model], # Module list + sample_args=[(static_input, seq_start)], # Must match actual input structure + # memory_pool=torch.cuda.graphs.graph_pool_handle(), + allow_unused_input=False, +) + + +# 3. Warmup and execution +def run_inference(x, seq_start): + # Inputs must match sample_args' device/type/shape + x = x.to("cuda", non_blocking=True).requires_grad_(False) + seq_start = seq_start.to("cuda", non_blocking=True) + + with torch.cuda.amp.autocast(): + return graph_model(x, seq_start) + + +# Warm-up (essential for TE's kernel auto-tuner) +for _ in range(3): + _ = run_inference(static_input, seq_start) +torch.cuda.synchronize() + + +# 4. 
Usage with dynamic sequence lengths +def process_batch(inputs, start_pos): + # inputs: (batch, seq) on CPU + inputs_gpu = inputs.to("cuda", non_blocking=True) + + # Output shares memory with pre-allocated buffers + return run_inference(inputs_gpu, start_pos) diff --git a/docs/examples/te_gemma/check_gemm.py b/docs/examples/te_gemma/check_gemm.py new file mode 100755 index 0000000000..1ed6edd23a --- /dev/null +++ b/docs/examples/te_gemma/check_gemm.py @@ -0,0 +1,137 @@ +import functools +from typing import Optional, Tuple, Union, List +import torch +import transformer_engine as te +import transformer_engine_torch as tex +from transformer_engine.pytorch.constants import TE_DType +from transformer_engine.pytorch.utils import assert_dim_for_fp8_exec +from transformer_engine.pytorch.module.base import get_workspace +import transformer_engine.pytorch.cpp_extensions as cpp_tex + + +@functools.lru_cache(maxsize=None) +def _empty_tensor() -> torch.Tensor: + """Get tensor with no entries and no data""" + return torch.Tensor() + + +def gemm( + A: torch.Tensor, + B: torch.Tensor, + dtype: torch.dtype, + workspace: torch.Tensor, + gelu: bool = False, + gelu_input: Optional[torch.Tensor] = None, + grad: bool = False, + accumulate: bool = False, + layout: str = "TN", + out: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + use_bias: bool = False, + ub_algo: tex.CommOverlapAlgo = None, + ub: Union[tex.CommOverlap, tex.CommOverlapP2P] = None, + extra_output_tensor: torch.Tensor = None, +) -> Tuple[Union[torch.Tensor, None], ...]: + """Non FP8 GEMM.""" + + assert layout in ("TN", "NN", "NT"), f"GEMM layout {layout} not supported." + transa = layout[0] == "T" + transb = layout[1] == "T" + empty_tensor = _empty_tensor() + fp8_index = -1 # dummy index + + if out is None: + out = torch.empty( + B.shape[1] if transb else B.shape[0], + A.shape[0] if transa else A.shape[1], + dtype=dtype, + device="cuda", + ) + else: + if not out.is_contiguous(): + raise ValueError("Output tensor is not contiguous.") + + if gelu and not grad: + gelu_input = torch.empty_like(out, dtype=dtype) + elif not gelu: + gelu_input = empty_tensor + + if grad and use_bias: + grad_bias = torch.empty(B.shape[1], dtype=out.dtype, device="cuda") + else: + grad_bias = empty_tensor + + bias = bias if use_bias else empty_tensor + + assert ( + A.dtype == dtype and B.dtype == dtype + ), f"Expected dtype={dtype}, but found A.dtype={A.dtype} and B.dtype={B.dtype}" + input_dtype = TE_DType[dtype] + output_dtype = TE_DType[out.dtype] + if use_bias: + bias_dtype = TE_DType[grad_bias.dtype] if grad else TE_DType[bias.dtype] + else: + bias_dtype = output_dtype + + args = ( + A, + empty_tensor, + fp8_index, + input_dtype, + transa, + B, + empty_tensor, + fp8_index, + input_dtype, + transb, + out, + empty_tensor, # out_scale + output_dtype, + empty_tensor, # out_amax + grad_bias if grad else bias, + bias_dtype, + gelu_input, + grad, + workspace, + workspace.shape[0], + accumulate, + False, # use_split_accumulator + ) + fn = torch.ops.tex_ts.te_gemm_ts + if ub_algo is not None: + assert ub is not None, "ub object is None!" 
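+    # The positional tuple built above mirrors the argument order this te_gemm_ts
+    # call expects: A with its scale-inverse placeholder, FP8 index, dtype and
+    # transpose flag; B with the same fields; the output tensor plus its scale/amax
+    # placeholders and dtype; bias (or grad_bias) and its dtype; the GELU input
+    # buffer; the grad flag; the cuBLAS workspace and its size; and the
+    # accumulate / use_split_accumulator flags.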
+ _ = fn(*args) + + import pdb + + pdb.set_trace() + return out, grad_bias, gelu_input + + +if __name__ == "__main__": + fc2_weight = torch.load("fc2_weight.pth").cuda() + + base_repo = "/perfhome/mnt/wkstn/work/repos/te_gemma_gen_support/TransformerEngine/docs/examples/te_gemma/" + base_repo = "" + gelu_out = torch.load(base_repo + "gelu_out.pth").cuda() + + activation_dtype = torch.bfloat16 + fc2_bias = _empty_tensor() + use_fc2_bias = False + + dim_size = list(gelu_out.size()) + dim_size[1] = fc2_weight.size(0) + fc2_out = torch.empty(dim_size, dtype=activation_dtype, device=gelu_out.device) + + _ = cpp_tex.gemm( + fc2_weight, + gelu_out, + activation_dtype, + get_workspace(), + bias=fc2_bias, + use_bias=use_fc2_bias, + out=fc2_out, + ub_algo=None, + ub=None, + extra_output_tensor=None, + ) diff --git a/docs/examples/te_gemma/check_rope.ipynb b/docs/examples/te_gemma/check_rope.ipynb new file mode 100755 index 0000000000..26d5c9058f --- /dev/null +++ b/docs/examples/te_gemma/check_rope.ipynb @@ -0,0 +1,716 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "72f61b51-b6fc-4463-9783-d42a25ca3a2f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before tex import\n" + ] + } + ], + "source": [ + "import torch\n", + "import torch.nn.functional as F\n", + "import math\n", + "print(\"before tex import\")\n", + "import transformer_engine as te\n", + "import transformer_engine_torch as tex" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1f81be75-bf64-43b2-852a-7c482a1c3418", + "metadata": {}, + "outputs": [], + "source": [ + "from transformer_engine.pytorch.attention import apply_rotary_pos_emb" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "8853f973-d834-41a9-929d-8687b947134f", + "metadata": {}, + "outputs": [], + "source": [ + "def compare_rope_outputs(t, freqs_s11d, freqs_sb1d):\n", + " output1 = tex.fused_rope_forward(t, freqs_s11d, torch.Tensor(), False)\n", + " output2 = tex.fused_rope_forward(t, freqs_sb1d, torch.Tensor(), False)\n", + " print(output1, output2, sep=\"\\n\")\n", + " assert torch.allclose(output1, output2)\n", + " return output1, output2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6b7bada1-6748-46f1-93a4-c2ac1a617063", + "metadata": {}, + "outputs": [], + "source": [ + "torch.manual_seed(0)\n", + "b = 2\n", + "s = 3\n", + "h = 2\n", + "d = 4" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "54a8f6d6-28f8-4a9a-8ba0-0fdefff138e7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([3, 1, 1, 4]) torch.Size([3, 2, 1, 4])\n" + ] + } + ], + "source": [ + "freqs_s11d = torch.ones(s, 1, 1, d).cuda() * math.pi/4\n", + "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n", + "t = torch.ones(s, b, h, d).cuda()\n", + "\n", + "print(freqs_s11d.shape, freqs_sb1d.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5070307a-3104-401b-b84c-00f3bbf02ccc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[[0.7854, 0.7854, 0.7854, 0.7854]]],\n", + "\n", + "\n", + " [[[0.7854, 0.7854, 0.7854, 0.7854]]],\n", + "\n", + "\n", + " [[[0.7854, 0.7854, 0.7854, 0.7854]]]], device='cuda:0')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freqs_s11d" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "81e52785-e6ad-4180-9567-564af692375c", 
+ "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 4, 4, 1)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freqs_s11d.stride()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "0da9bc09-7e1e-4056-85eb-64b6122c7440", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4, 0\n", + "4, 4, 4, 1, \n", + "nvt_fused_rope_fwd: 4, 0fused_rope_fwd: 4, 0fused_rope_fwd_launcher: 4, 0thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, 
freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n",
+    "    [... repeated per-thread debug lines elided; every (thread_id, s_id, b_id) combination reports freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107 ...]\n",
+    "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 
0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n" + ] + } + ], + "source": [ + "output = tex.fused_rope_forward(t, freqs_s11d, torch.Tensor(), False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1b78017d-09b3-4b5f-93a8-75f6ba6f131c", + "metadata": {}, + "outputs": [], + "source": [ + "output_unfused=apply_rotary_pos_emb(\n", + " t,\n", + " freqs_s11d,\n", + " tensor_format=\"sbhd\",\n", + " fused=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6f5d9350-deb1-48ef-a0a2-e18e01ed336f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output_unfused" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b01e29b8-dfdf-41ac-81a5-d8edf6a8c168", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "4, 0\n", + "4, 4, 4, 1, \n", + 
"nvt_fused_rope_fwd: 4, 0fused_rope_fwd: 4, 0fused_rope_fwd_launcher: 4, 08, 4\n", + "8, 4, 4, 1, \n", + "nvt_fused_rope_fwd: 8, 4fused_rope_fwd: 8, 4fused_rope_fwd_launcher: 8, 4thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 0, b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 0, 
b_id: 0, freq_stride_s: 4, freq_stride_b: 0, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n",
+    "    [... repeated per-thread debug lines elided; the first call (freqs_s11d) reports freq_stride_s: 4, freq_stride_b: 0 for every thread, the second call (freqs_sb1d) reports freq_stride_s: 8, freq_stride_b: 4; all threads print freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107 ...]\n",
+    "thread_id: 2, s_id: 1, b_id: 1, 
freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 1, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 0, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 1, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 2, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "thread_id: 3, s_id: 1, b_id: 0, freq_stride_s: 8, freq_stride_b: 4, freq: 0.785398, v_sin: 0.707107, v_cos: 0.707107\n", + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')\n", + "tensor([[[[-5.9605e-08, 
-5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]]],\n", + " device='cuda:0')\n" + ] + } + ], + "source": [ + "output1, output2 = compare_rope_outputs(t, freqs_s11d, freqs_sb1d)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b168b178-1f63-4ccc-b084-2ac2c1ec016b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([6, 1, 1, 4]) torch.Size([6, 2, 1, 4])\n" + ] + } + ], + "source": [ + "freqs_s11d = torch.randn(s, 1, 1, d).cuda()\n", + "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n", + "\n", + "print(freqs_s11d.shape, freqs_sb1d.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "33ec2e07-6e54-49f7-92f7-2f217a766456", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.0000e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.0000e+00]]]],\n", + " device='cuda:0')\n", + "tensor([[[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]],\n", + "\n", + " [[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00]]],\n", + "\n", + "\n", + " [[[-5.9605e-08, -5.9605e-08, 1.4142e+00, 1.4142e+00],\n", + " [ 7.0711e-01, 
7.0711e-01, 7.0711e-01, 7.0711e-01]],\n", + "\n", + " [[ 7.0711e-01, 7.0711e-01, 7.0711e-01, 7.0711e-01],\n", + " [ 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]]]],\n", + " device='cuda:0')\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m output1, output2 \u001b[38;5;241m=\u001b[39m \u001b[43mcompare_rope_outputs\u001b[49m\u001b[43m(\u001b[49m\u001b[43mt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreqs_s11d\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreqs_sb1d\u001b[49m\u001b[43m)\u001b[49m\n", + "Cell \u001b[0;32mIn[8], line 5\u001b[0m, in \u001b[0;36mcompare_rope_outputs\u001b[0;34m(t, freqs_s11d, freqs_sb1d)\u001b[0m\n\u001b[1;32m 3\u001b[0m output2 \u001b[38;5;241m=\u001b[39m tex\u001b[38;5;241m.\u001b[39mfused_rope_forward(t, freqs_sb1d, torch\u001b[38;5;241m.\u001b[39mTensor(), \u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(output1, output2, sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 5\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m torch\u001b[38;5;241m.\u001b[39mallclose(output1, output2)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m output1, output2\n", + "\u001b[0;31mAssertionError\u001b[0m: " + ] + } + ], + "source": [ + "output1, output2 = compare_rope_outputs(t, freqs_s11d, freqs_sb1d)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b58b818-7b31-4ecd-80bd-b5ba049b3c2e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "before tex import\n" + ] + } + ], + "source": [ + "freqs_s11d = torch.randn(s, 1, 1, d).cuda()\n", + "print(freqs_s11d)\n", + "freqs_sb1d = freqs_s11d.broadcast_to(s, b, 1, d).clone()\n", + "print(freqs_sb1d)\n", + "assert torch.all(torch.eq(freqs_sb1d[:, 0, ...], freqs_sb1d[:, 1, ...]))\n", + "\n", + "comp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c04940b8-3056-466b-90f6-07a02ac47ace", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/media/calibration.svg b/docs/examples/te_gemma/media/calibration.svg new file mode 100755 index 0000000000..b1e1b5ae4b --- /dev/null +++ b/docs/examples/te_gemma/media/calibration.svg @@ -0,0 +1 @@ +FP8 with initial scaling factorsHighprecisionweightInitialFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMMWeight calibrationHighprecisionweightFP8 scalingfactorsHighprecisioninputHighprecisionGEMMFP8 with calibrated scaling factorsHighprecisionweightCalibratedFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMM \ No newline at end of file diff --git a/docs/examples/te_gemma/media/calibration_1_half.svg 
b/docs/examples/te_gemma/media/calibration_1_half.svg new file mode 100755 index 0000000000..af2641387f --- /dev/null +++ b/docs/examples/te_gemma/media/calibration_1_half.svg @@ -0,0 +1 @@ +HighprecisionweightInitialFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMMHighprecisionweightFP8 scalingfactorsHighprecisioninputHighprecisionGEMMFP8 with initial scaling factorsWeight calibration \ No newline at end of file diff --git a/docs/examples/te_gemma/media/calibration_2_half.svg b/docs/examples/te_gemma/media/calibration_2_half.svg new file mode 100755 index 0000000000..2d56f7d434 --- /dev/null +++ b/docs/examples/te_gemma/media/calibration_2_half.svg @@ -0,0 +1 @@ +Weight calibrationHighprecisionweightFP8 scalingfactorsHighprecisioninputHighprecisionGEMMFP8 with calibrated scaling factorsHighprecisionweightCalibratedFP8 scalingfactorsFP8WeightFP8InputHighprecisioninputFP8GEMM \ No newline at end of file diff --git a/docs/examples/te_gemma/media/fp8_model_init.svg b/docs/examples/te_gemma/media/fp8_model_init.svg new file mode 100755 index 0000000000..c7fce2120d --- /dev/null +++ b/docs/examples/te_gemma/media/fp8_model_init.svg @@ -0,0 +1 @@ +FP32/BF16FP8FP8 with fp8_model_init()FP8weightFP8GEMMHighprecisionweightHighprecisioninputHighprecisionGEMMHighprecisionweightFP8WeightFP8inputFP8GEMMFP8input \ No newline at end of file diff --git a/docs/examples/te_gemma/media/fp8_model_init_1_half.svg b/docs/examples/te_gemma/media/fp8_model_init_1_half.svg new file mode 100755 index 0000000000..3b217a3eb2 --- /dev/null +++ b/docs/examples/te_gemma/media/fp8_model_init_1_half.svg @@ -0,0 +1 @@ +FP32/BF16HighprecisionweightHighprecisioninputHighprecisionGEMMHighprecisionweightFP8WeightFP8inputFP8GEMMFP8 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/fp8_model_init_2_half.svg b/docs/examples/te_gemma/media/fp8_model_init_2_half.svg new file mode 100755 index 0000000000..46587664fe --- /dev/null +++ b/docs/examples/te_gemma/media/fp8_model_init_2_half.svg @@ -0,0 +1 @@ +FP8FP8 with fp8_model_init()FP8weightFP8GEMMHighprecisionweightFP8WeightFP8inputFP8GEMMFP8input \ No newline at end of file diff --git a/docs/examples/te_gemma/media/generation_animation.gif b/docs/examples/te_gemma/media/generation_animation.gif new file mode 100755 index 0000000000..25150cb9b6 Binary files /dev/null and b/docs/examples/te_gemma/media/generation_animation.gif differ diff --git a/docs/examples/te_gemma/media/graphs.svg b/docs/examples/te_gemma/media/graphs.svg new file mode 100755 index 0000000000..f734637e6d --- /dev/null +++ b/docs/examples/te_gemma/media/graphs.svg @@ -0,0 +1 @@ +Without CUDA GraphsWith CUDA GraphsLaunch 1Kernel 1Launch 2Kernel 2Launch 3Kernel 3Launch Graph 1Kernel 1Kernel 2Kernel 3 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/graphs_1.png b/docs/examples/te_gemma/media/graphs_1.png new file mode 100755 index 0000000000..f42b50fe0d Binary files /dev/null and b/docs/examples/te_gemma/media/graphs_1.png differ diff --git a/docs/examples/te_gemma/media/graphs_2.png b/docs/examples/te_gemma/media/graphs_2.png new file mode 100755 index 0000000000..35c34ede55 Binary files /dev/null and b/docs/examples/te_gemma/media/graphs_2.png differ diff --git a/docs/examples/te_gemma/media/plot.svg b/docs/examples/te_gemma/media/plot.svg new file mode 100755 index 0000000000..481f156df6 --- /dev/null +++ b/docs/examples/te_gemma/media/plot.svg @@ -0,0 +1 @@ +87.68 s54.11 s28.22 s16.75 s12.13 s0 s10 s20 s30 s40 s50 s60 s70 s80 s90 s100 sHF (baseline)TE 
(subsitution ofGemmaDecoderLayer withte.TransformerLayer)TE + THD attentionTE + THD attention + CUDA GraphsTE + THD attention + FP8 \ No newline at end of file diff --git a/docs/examples/te_gemma/media/thd_bshd.svg b/docs/examples/te_gemma/media/thd_bshd.svg new file mode 100755 index 0000000000..47eed69565 --- /dev/null +++ b/docs/examples/te_gemma/media/thd_bshd.svg @@ -0,0 +1 @@ +BSHD LayoutQKVQKVCumulative sequence lengths:3, 3 + 1, 3 + 1 + 3, 3 + 1 + 3 + 1Sequence offsets:0, 4, 8, 12[batch_size,seq_len,head_nr,dim][total_nr_tokens,head_nr,dim]Seq. 1Seq. 2Seq. 4Seq. 3sbtTHD LayoutPad. 1Pad. 2Pad. 4Pad. 3Attention masktokenpadding \ No newline at end of file diff --git a/docs/examples/te_gemma/requirements.txt b/docs/examples/te_gemma/requirements.txt new file mode 100755 index 0000000000..c90fb6dad0 --- /dev/null +++ b/docs/examples/te_gemma/requirements.txt @@ -0,0 +1,4 @@ +transformers==4.41.1 +accelerate==0.30.1 +datasets==2.19.1 +sentencepiece==0.2.0 \ No newline at end of file diff --git a/docs/examples/te_gemma/run_gemma_2b.py b/docs/examples/te_gemma/run_gemma_2b.py new file mode 100644 index 0000000000..db2fb087c9 --- /dev/null +++ b/docs/examples/te_gemma/run_gemma_2b.py @@ -0,0 +1,15 @@ +from transformers import AutoTokenizer, AutoModelForCausalLM +from huggingface_hub import login + +access_token = "" +login(access_token) + +model_name = "google/gemma-3-4b-it" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained(model_name) +print(model.config) +input_text = "Write me a poem about Machine Learning." +input_ids = tokenizer(input_text, return_tensors="pt") + +outputs = model.generate(**input_ids) +print(tokenizer.decode(outputs[0])) diff --git a/docs/examples/te_gemma/run_generation.py b/docs/examples/te_gemma/run_generation.py new file mode 100755 index 0000000000..910fa325d0 --- /dev/null +++ b/docs/examples/te_gemma/run_generation.py @@ -0,0 +1,55 @@ +from utils import * +import transformer_engine.pytorch as te + +hyperparams.model_name = ( # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" + "/perfhome/repos/ckpt/models/gemma-7b-hf/" +) +hyperparams.qkv_format = "thd" + +run_generation = True +run_calibration = False + +if run_calibration: + hyperparams.fuse_qkv_params = True # This is needed by the last improvement. + + model = init_te_gemma_model(hyperparams) + + # Calibration + with te.fp8_autocast(enabled=False, calibrating=True), torch.autocast( + device_type="cuda", dtype=torch.bfloat16 + ): + model.train() + run_forward_pass(model, hyperparams, num_iters=512) + + # Compute scale_fwd with enabled fp8 autocast + with te.fp8_autocast(enabled=True), torch.autocast(device_type="cuda", dtype=torch.bfloat16): + run_forward_pass(model, hyperparams, 1) + + # Some parameters are in pointing to the same tensors, double save is avoided here. + dict_to_save = { + k: v + for k, v in model.state_dict().items() + if ("_context_phase" not in k and "_generation_phase" not in k) + } + torch.save(dict_to_save, "calibrated_weights.pth") # <== Add path to save calibrated weights. + + +if run_generation: + + # hyperparams.generation_cuda_graphs = False # 4.15s + hyperparams.generation_cuda_graphs = True # 4.38s + + if hyperparams.generation_cuda_graphs: + # It is necessary to preallocate a static buffer. + # CUDA graphs require static input tensors for every kernel. 
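The static-tensor requirement mentioned in the comment above is the crux of using CUDA Graphs here: a captured graph always reads from and writes to the same memory addresses, so fresh data has to be copied into pre-allocated buffers rather than bound to new tensors. A minimal, self-contained illustration of the difference (toy tensors, not part of the tutorial files):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
buf = torch.zeros(4, 8, device=device)        # pre-allocated "static" buffer
new_data = torch.randn(4, 8, device=device)

# Rebinding a Python name allocates fresh memory; a captured graph would keep
# reading the old storage and never see this result.
rebound = new_data * 2.0
print(rebound.data_ptr() == buf.data_ptr())   # False

# An in-place copy keeps the storage address fixed, which is what the
# "static copy - for CUDA graphs" comments in te_gemma.py rely on.
ptr_before = buf.data_ptr()
buf.copy_(new_data * 2.0)
print(buf.data_ptr() == ptr_before)           # True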
+ # This approach may result in a slight increase in memory consumption; + # however, the substantial speedup achieved makes it worthwhile. + hyperparams.cuda_graphs_static_batch_size = 64 + hyperparams.cuda_graphs_static_max_seq_len = 128 + hyperparams.cuda_graphs_static_max_context_len = 128 + + hyperparams.is_paged = False + model = init_te_gemma_model(hyperparams) + + print_sample_of_generated_texts(model) + benchmark_generation(model) diff --git a/docs/examples/te_gemma/run_generation_llama.py b/docs/examples/te_gemma/run_generation_llama.py new file mode 100755 index 0000000000..1c3e6626ca --- /dev/null +++ b/docs/examples/te_gemma/run_generation_llama.py @@ -0,0 +1,12 @@ +from utils import * + +hyperparams.model_name = ( # "/tmp/gemma-7b-hf/" # <== Add model weight location here e.g. "/path/to/downloaded/gemma/weights" + "/perfhome/repos/ckpt/models/llama2-7b-hf/" +) +hyperparams.qkv_format = "thd" + +# model = init_te_llama_model(hyperparams) +model = init_baseline_model(hyperparams) + +print_sample_of_generated_texts(model) +# benchmark_generation(model) diff --git a/docs/examples/te_gemma/te_gemma.py b/docs/examples/te_gemma/te_gemma.py new file mode 100755 index 0000000000..706ea16bc4 --- /dev/null +++ b/docs/examples/te_gemma/te_gemma.py @@ -0,0 +1,594 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.gemma.modeling_gemma import GemmaForCausalLM, GemmaConfig, GemmaModel + +import torch.nn.functional as F + + +class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `GemmaDecoderLayer` and easier to replace it in the code. + + Args: + config: GemmaConfig + args: positional args (for compatibility with `GemmaDecoderLayer`) + kwargs: keyword args (for compatibility with `GemmaDecoderLayer`) + """ + + def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): + + self.gemma_config = config + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="geglu", + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.gemma_config.head_dim, + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, + ) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. + + # this args cannot be passed to TransformerLayer + keys_to_remove = [ + "position_ids", + "past_key_value", + "output_attentions", + "use_cache", + "cache_position", + ] + for key in keys_to_remove: + kwargs.pop(key, None) + + rope_emb = kwargs.pop("rope_emb", None) + # We need to return tuple to be compatible with HF. 
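Looking back at the calibration branch in run_generation.py above, the pattern is: run ordinary high-precision forward passes with calibrating=True so TE records amax statistics, run a single pass with FP8 enabled so the calibrated scaling factors are materialized, then save the state dict. A minimal sketch of that flow on a toy te.Linear (the layer, data and file name are illustrative, and step 2 assumes an FP8-capable GPU):

import torch
import transformer_engine.pytorch as te

layer = te.Linear(1024, 1024).cuda()
data = [torch.randn(16, 1024, device="cuda") for _ in range(8)]

# 1. High-precision passes that only collect amax statistics.
with te.fp8_autocast(enabled=False, calibrating=True), torch.autocast("cuda", dtype=torch.bfloat16):
    for x in data:
        layer(x)

# 2. One FP8 pass so calibrated scaling factors are computed and stored.
with te.fp8_autocast(enabled=True), torch.autocast("cuda", dtype=torch.bfloat16):
    layer(data[0])

# 3. The state dict now carries FP8 metadata alongside the high-precision weights.
torch.save(layer.state_dict(), "calibrated_toy_layer.pth")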
+ return (super().forward(*args, rotary_pos_emb=rope_emb, **kwargs),) + + +class StaticGemmaModel(torch.nn.Module): + """ + StaticGemma is based of HF GemmaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: GemmaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.normalizer = torch.tensor(self.model.config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + rope_emb: torch.Tensor = None, + ): + # print(f"StaticGemmaModel forward start") + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + # print(f"layer {i}") + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + rope_emb=rope_emb, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits, hidden_states + + +class GemmaGenerator(torch.nn.Module): + """ + GemmaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: GemmaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.gemma_layers = StaticGemmaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.gemma_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward( + self, + hidden_states: torch.Tensor, + mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + rope_emb: torch.Tensor = None, + ): + logits, _ = self.gemma_layers( + hidden_states, attention_mask=mask, attn_mask_type=attn_mask_type, rope_emb=rope_emb + ) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + return next_tokens + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `GemmaDecoderLayer` with custom `TEGemmaDecoderLayer`. + """ + original_gemma_decoder_cls = transformers.models.gemma.modeling_gemma.GemmaDecoderLayer + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = original_gemma_decoder_cls + + +class TEGemmaForCausalLM(GemmaForCausalLM): + """ + Causal LM created with `GemmaModel`. The underlying `GemmaDecoderLayer` + class is monkey-patched with `TEGemmaDecoderLayer` class before + initializing the causal LM with `GemmaForCausalLM`. 
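GemmaGenerator above reduces per-token decoding to: take the logits at the last position, pick the argmax, re-embed the chosen token, and write it back into the static hidden-states buffer in place. A stripped-down sketch of that single step with toy modules and shapes (stand-ins, not the tutorial's buffers):

import torch

b, hidden, vocab = 2, 16, 32
embed = torch.nn.Embedding(vocab, hidden)
lm_head = torch.nn.Linear(hidden, vocab, bias=False)

hidden_states = torch.randn(b, 1, hidden)      # plays the role of the static [b, 1, hd] buffer
logits = lm_head(hidden_states)[:, -1, :]      # logits of the last position only
next_tokens = torch.argmax(logits, dim=1)      # greedy choice, shape [b]

# Write the embedding of the chosen token back into the same buffer (in place),
# so the next iteration of the graphed generation phase reads the new input.
hidden_states.copy_(embed(next_tokens).unsqueeze(1))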
+ + Args: + config: GemmaConfig + """ + + def __init__(self, config: GemmaConfig): + with replace_decoder(te_decoder_cls=TEGemmaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = GemmaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticGemmaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + self.te_rope_emb = RotaryPositionEmbedding(self.config.head_dim)( + max_seq_len=self.config.max_position_embeddings + ).cuda() + + @staticmethod + def _padding_to_end(inputs, lengths, max_seq_len=None): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) if max_seq_len is None else max_seq_len + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. fix the max seq len to + # the longest sequence in the batch + actual_max_seq_len = max_seq_len + inputs.data = new_input_ids[:, :actual_max_seq_len] + # print(f"actual_max_seq_len: {actual_max_seq_len}") + + # For Paged Attention, make the valid sequences, multiple of 64 + # inputs.data = new_input_ids[:, :4].repeat(1, 16) + # import pdb; pdb.set_trace() + # print(f"inputs.data.shape: {inputs.data.shape}") + # exit() + + def _next_64_multiply(self, x): + return ((x + 63) // 64) * 64 + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_hidden_states_buffer(self, input_ids: torch.Tensor): + tensor = torch.empty( + (input_ids.shape[0], input_ids.shape[1], self.hidden_size), + device="cuda", + dtype=torch.float32, + ) + # import pdb; pdb.set_trace() + return tensor + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_inference_params(self, *args, **kwargs): + infer_params = InferenceParams(*args, **kwargs) + return infer_params + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _get_max_input_seq_len(self, input_ids): + return ( + input_ids.shape[1] + if not hasattr(self.config, "cuda_graphs_static_max_context_len") + else self.config.cuda_graphs_static_max_context_len + ) + + # The buffer for generation is some part (beginning) of hidden states buffer. + # This function returns pointer to it and also copies there data if provided. + def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): + # hidden_states_buffer has shape [b, s, hd] + # generation_buffer will have shape [b, 1, hd] + # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)" + # will return uncontiguous buffer, which we want to avoid. 
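The comment above is easy to miss, so here is a quick standalone check of why the flat view used just below is preferred: slicing position 0 of a [b, s, hd] tensor gives a non-contiguous view, while viewing the first b * hd elements of the flat storage gives a contiguous [b, 1, hd] buffer that aliases the start of the big allocation (small shapes for illustration):

import torch

b, s, hd = 2, 3, 4
hidden = torch.arange(b * s * hd, dtype=torch.float32).reshape(b, s, hd)

sliced = hidden[:, 0, :].unsqueeze(1)            # [b, 1, hd] but strided
print(sliced.is_contiguous())                    # False - unsuitable as a static CUDA-graph buffer

flat = hidden.view(-1)[: b * hd].view(b, 1, hd)  # first b*hd elements of the same storage
print(flat.is_contiguous())                      # True
print(flat.data_ptr() == hidden.data_ptr())      # True - it reuses the start of the big buffer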
+ output = hidden_states_buffer.view(-1)[ + : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2] + ] + if data_to_copy is not None: + output.copy_(data_to_copy.reshape(-1)) + generation_buffer = output.view( + (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2]) + ) + return generation_buffer + + def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): + # import pdb; pdb.set_trace() + hidden_states = self._create_hidden_states_buffer(input_ids) + hidden_states.copy_(self.model.embed_tokens(input_ids)) + + # We need to update offsets before every forward pass to make cache work properly. + lengths = input_ids.ne(0).sum(dim=1) + + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=lengths, max_input_length=input_ids.shape[1] + # ) + lengths = input_ids.ne(0).sum(dim=1) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + else: + inference_params.setup_before_new_input(length=input_ids.shape[1]) + + logits, hs_buffer = self._model_context_phase( + hidden_states, + attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", + rope_emb=self.te_rope_emb, + ) + + if self.config.qkv_format == "thd": + logits = logits[torch.arange(logits.size(0)), lengths - 1, :] + else: + logits = logits[:, -1, :] + + next_tokens = torch.argmax(logits, dim=1) + + # self.hidden_states have shape [b, s, hd]. + # We return hidden state for the last token - output has shape [b, 1, hd] + hidden_states = self._get_generation_buffer( + hidden_states, self.model.embed_tokens(next_tokens) + ) + return hidden_states, next_tokens + + def _make_mask_one_token_longer(self, mask): + return torch.cat( + [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1 + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + pad_token_id: int = 0, + max_new_tokens: int = 0, + *args, + **kwargs + ): + self.eval() + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast( + enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None + ): + + lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] + + # print(f"max_input_sequence_len: {max_input_sequence_len}") + # exit() + + if self.config.qkv_format == "thd": + # For thd layout padding is at the end, otherwise at the beginning. + TEGemmaForCausalLM._padding_to_end( + input_ids, + lengths, + max_seq_len=( + self.config.cuda_graphs_static_max_context_len + if self.config.generation_cuda_graphs + else None + ), + ) + + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) + + # InferenceParams is a cache, where keys and values of previous tokens are stored. + # Moreover it stores length of both already generated and input sequences. 
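Before the cache is created below, it may help to spell out the bookkeeping that pre_step expects and how the paged-cache size is derived: each step passes "how many new tokens arrive per sequence" as an OrderedDict, and with is_paged=True the cache needs enough pages for batch_size * max_sequence_length tokens at page_size tokens per page. A small helper sketch (the helper names are illustrative; only the OrderedDict format and the page arithmetic come from the code below):

from collections import OrderedDict
import torch

def step_lengths(lengths: torch.Tensor) -> OrderedDict:
    # Maps sequence index -> number of tokens fed in this step,
    # the format that inference_params.pre_step() consumes.
    return OrderedDict(zip(range(len(lengths)), lengths.tolist()))

def total_pages(batch_size: int, max_seq_len: int, page_size: int = 64) -> int:
    # e.g. batch 64, max sequence length 128, page size 64 -> 128 pages
    return batch_size * max_seq_len // page_size

prompt_lengths = torch.tensor([7, 3, 5])
print(step_lengths(prompt_lengths))                  # context phase: real prompt lengths
print(step_lengths(torch.ones(3, dtype=torch.int)))  # generation phase: one new token per sequence
print(total_pages(64, 128))                          # 128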
+ inference_params = self._create_inference_params( + max_batch_size=batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=128, + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64 * 128 // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) + ) + + self._model_context_phase.set_inference_params(inference_params) + self._model_generation_phase.set_inference_params(inference_params) + + # print(f"context phase start") + # import pdb; pdb.set_trace() + hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + + # print(f"context phase done") + # Generation phase. + if self.config.qkv_format == "thd": + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. + # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + next_tokens = self._model_generation_phase( + hidden_states, + mask=mask, + attn_mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + rope_emb=self.te_rope_emb, + ) + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. + if self.config.qkv_format == "thd": + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) + else: + inference_params.setup_before_new_input(length=1) + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", + ) + return logits + + +class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): + """ + TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: GemmaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. 
+ self.config = config + self.hidden_states_buffer = torch.empty( + ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + self.config.hidden_size, + ) + ).cuda() + + # This is in fact part of the buffer for hidden_states. + self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) + self.inference_params = InferenceParams( + max_batch_size=self.config.cuda_graphs_static_batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self.config.cuda_graphs_static_max_seq_len, + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64 + * self.config.cuda_graphs_static_max_seq_len + // 64, # 64 * 64 (max_sequence_length) / 64 (page_size) + ) + + self._model_generation_phase.set_inference_params(self.inference_params) + self._model_context_phase.set_inference_params(self.inference_params) + + def record(self): + # We want to record model in training=False, because it will be used in generation. + self.eval() + + # Here "the trick" happens. We override methods from TEGemmaForCausalLM + # with their recorded version. After invocation of each of them, + # captured graph will be replayed with minimal usage of CPU, + # what will lead to huge speedup. + input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + + # [1] Should be same as lengths_tensor from TEGemmaForCausalLM + lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32) + max_input_length = input_shape[1] + + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) + + # print(f"context phase recording start") + + self._model_context_phase = self.record_graph( + self._model_context_phase, + self.hidden_states_buffer, + attn_mask_type="padding_causal", + rope_emb=self.te_rope_emb, + ) # CUDA Graphs recording + + # print(f"context phase recording done") + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + + lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32) + + self.inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths))), lengths.tolist())) + ) + + self._model_generation_phase = self.record_graph( + self._model_generation_phase, + self.generation_buffer, + attn_mask_type="padding", + rope_emb=self.te_rope_emb, + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor, **sample_kwargs): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. 
+ fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=5, + sample_kwargs=sample_kwargs, + ) + return graphed_function diff --git a/docs/examples/te_gemma/te_gemma_loading_weights.py b/docs/examples/te_gemma/te_gemma_loading_weights.py new file mode 100755 index 0000000000..41f62ad7f3 --- /dev/null +++ b/docs/examples/te_gemma/te_gemma_loading_weights.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import re +import gc +import torch + +from typing import List + +from transformer_engine.pytorch.fp8 import fp8_model_init + +from transformers.modeling_utils import load_state_dict, _load_state_dict_into_model +from transformers.utils.hub import get_checkpoint_shard_files + +""" + This file contains logic of mapping the HuggingFace GemmaModel parameters + with TransformerEngine TransformerLayer. When we have initialized Transformer models + both with HF and with TE, we can copy parameters from the first to the second. +""" + + +def _load_weights_for_fp8_model(vanilla_model, hyperparams): + # The weights are loaded from the file with state_dict + # of model with weights which contains also fp8 parameters. + # The weights are in BF16 precision, but they contain fp8 metadata + # computed by the calibration procedure. + vanilla_model.load_state_dict( + torch.load(hyperparams.fp8_model_weights_filename), + strict=False, + # strict = false, because some parameters have + # multiple pointers to the same weight + # vanilla_model._model_context_phase.model + # and vanilla_model._model_generation_phase.model + ) + + +def _load_weights_for_standard_model(vanilla_model, config): + # The weights are loaded from the file with original weights. + archive_file = os.path.join(config.model_name, "model.safetensors.index.json") + resolved_archive_file, _ = get_checkpoint_shard_files(config.model_name, archive_file) + total_dict = {} + for shard_file in resolved_archive_file: + state_dict = load_state_dict(shard_file) + total_dict.update(state_dict) + + replace_params( + total_dict, + vanilla_model.state_dict(), + config, + qkv_fused_and_interleaved=config.fuse_qkv_params, + ) + # Copy parameters like embedding: + _load_state_dict_into_model(vanilla_model, total_dict, start_prefix="") + + # Force mem release. Taken from huggingface code. 
+ del total_dict + gc.collect() + + +def load_te_model(cls, config): + """ + Custom method adapted from `from_pretrained` method in HuggingFace + Transformers repo: + https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579 + """ + config.use_cache = False # To make TransformerLayer compatible with GemmaModel + with fp8_model_init(config.fp8_model_init): + # there we need only to create model + vanilla_model = cls(config).to(torch.bfloat16).cuda() + + # return vanilla_model + # and now we copy the weights into it + if config.fp8_model_weights_filename is not None: + _load_weights_for_fp8_model(vanilla_model, config) + else: + _load_weights_for_standard_model(vanilla_model, config) + + return vanilla_model + + +def _get_all_layer_prefixes_to_update(hf_state_dict): + """ + There are many parameters in hf_state_dict, whose name start with "model.layers.[number]." + This function extracts all strings like "model.layers.[number]." + that are starting strings of keys in hf_state_dict. + """ + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." + m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + return all_layer_prefixes + + +def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): + """ + Replaces params from TE TransformerLayer state_dict with corresponding parameters + from HuggingFace GemmaModel state_dict. + """ + all_layer_prefixes: List[str] = _get_all_layer_prefixes_to_update(hf_state_dict) + + for layer_prefix in all_layer_prefixes: + + def copy_from_ht_to_te(te_name, hf_name, start=None, end=None): + te_state_dict[layer_prefix + te_name].data[start:end].copy_( + hf_state_dict[layer_prefix + hf_name] + ) + + copy_from_ht_to_te( + "self_attention.layernorm_qkv.layer_norm_weight", "input_layernorm.weight" + ) + copy_from_ht_to_te("self_attention.proj.weight", "self_attn.o_proj.weight") + copy_from_ht_to_te("layernorm_mlp.layer_norm_weight", "post_attention_layernorm.weight") + copy_from_ht_to_te("layernorm_mlp.fc2_weight", "mlp.down_proj.weight") + copy_from_ht_to_te( + "layernorm_mlp.fc1_weight", "mlp.gate_proj.weight", end=config.intermediate_size + ) + copy_from_ht_to_te( + "layernorm_mlp.fc1_weight", "mlp.up_proj.weight", start=config.intermediate_size + ) + + if qkv_fused_and_interleaved: + """ + When qkv_fused_and_interleaved=True, key, query and value layers are on one tensor + in TE TransformerLayer. Moreover they are interleaved within each head. + Let q_i, k_i and v_i be query, key and value layers for i-th head respectively. + Then TE stores weight tensor in the form: + [q1 k1 v1 q2 k2 v2 ...] + This is done to maximally optimize performance time. 
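Since the interleaved layout described above is the trickiest part of the weight mapping, here is a small worked example of the row offsets that copy_interleave below computes, using toy sizes (2 heads, head_dim 3) instead of the real Gemma dimensions:

num_heads, head_dim = 2, 3   # toy sizes for illustration

for head in range(num_heads):
    base = head * head_dim * 3          # start of this head's [q | k | v] block in the fused tensor
    q_rows = list(range(base, base + head_dim))
    k_rows = list(range(base + head_dim, base + 2 * head_dim))
    v_rows = list(range(base + 2 * head_dim, base + 3 * head_dim))
    print(f"head {head}: q -> {q_rows}, k -> {k_rows}, v -> {v_rows}")

# head 0: q -> [0, 1, 2],  k -> [3, 4, 5],    v -> [6, 7, 8]
# head 1: q -> [9, 10, 11], k -> [12, 13, 14], v -> [15, 16, 17]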
+ """ + te_qkv_layer = te_state_dict[layer_prefix + "self_attention.layernorm_qkv.weight"] + + def copy_interleave(hf_name, idx): + src = hf_state_dict[layer_prefix + hf_name] + for head_nr in range(config.num_attention_heads): + dst_offset = head_nr * config.head_dim * 3 + dst_slice = slice( + dst_offset + idx * config.head_dim, dst_offset + (idx + 1) * config.head_dim + ) + src_slice = slice( + head_nr * config.head_dim, head_nr * config.head_dim + config.head_dim + ) + te_qkv_layer[dst_slice, :] = src[src_slice, :] + + copy_interleave("self_attn.q_proj.weight", 0) + copy_interleave("self_attn.k_proj.weight", 1) + copy_interleave("self_attn.v_proj.weight", 2) + else: + copy_from_ht_to_te( + "self_attention.layernorm_qkv.query_weight", "self_attn.q_proj.weight" + ) + copy_from_ht_to_te("self_attention.layernorm_qkv.key_weight", "self_attn.k_proj.weight") + copy_from_ht_to_te( + "self_attention.layernorm_qkv.value_weight", "self_attn.v_proj.weight" + ) + + return all_layer_prefixes diff --git a/docs/examples/te_gemma/te_gemma_save.py b/docs/examples/te_gemma/te_gemma_save.py new file mode 100755 index 0000000000..c83378840c --- /dev/null +++ b/docs/examples/te_gemma/te_gemma_save.py @@ -0,0 +1,872 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.gemma.modeling_gemma import GemmaForCausalLM, GemmaConfig, GemmaModel + +import torch.nn.functional as F + + +class CacheParams: + def __init__( + self, + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, + ): + self.max_seqlen_q = max_seqlen_q + self.max_seqlen_kv = max_seqlen_kv + self.cu_seqlens_q = cu_seqlens_q + self.cu_seqlens_kv = cu_seqlens_kv + self.cu_seqlens_q_padded = cu_seqlens_q_padded + self.cu_seqlens_kv_padded = cu_seqlens_kv_padded + + +def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): + """ + Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which + will be used later. + + (Currently a hack, this should be reformatted to a better method) + """ + + assert ( + lengths_tensor is not None and max_input_length is not None + ), 'lengths_tensor and max_input_length should not be none for qkv_format = "thd"' + + inference_params.max_incoming_seq_len = max_input_length + + lengths_tensor = lengths_tensor.to(inference_params.cu_seqlens_q.device) + + # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) + + # print(inference_params.step_dict) + + # def get_cache_params_in_infer_params(): + # return CacheParams(max_seqlen_q, max_seqlen_kv, inference_params.cu_seqlens_q, inference_params.cu_seqlens_kv, inference_params.cu_seqlens_q_padded, inference_params.cu_seqlens_kv_padded) + + # For the time being, create an ad-hoc field in `inference_params` to get the variables. + # @sudhakars: to create a better way later. 
+ # inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + + +# This class has been modified from +# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py +class GemmaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim) + ) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = ( + device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + return emb.unsqueeze(2) # should return in [b, s, 1, d] format + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthless, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) + + +class TEGemmaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `GemmaDecoderLayer` and easier to replace it in the code. + + Args: + config: GemmaConfig + args: positional args (for compatibility with `GemmaDecoderLayer`) + kwargs: keyword args (for compatibility with `GemmaDecoderLayer`) + """ + + def __init__(self, config: GemmaConfig, layer_idx: int, *args, **kwargs): + + self.gemma_config = config + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="geglu", + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.gemma_config.head_dim, + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, + ) + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. + """ + return self._allocator(size, dtype, device) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. 
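A quick way to sanity-check the rotary embedding math in GemmaRotaryEmbedding above: the inverse frequencies are base**(-2i/dim), each position index is multiplied by them, and the result is duplicated along the last dimension. Reshaped to [s, 1, 1, d], this matches the layout produced by TE's RotaryPositionEmbedding (used below) and expected by the fused RoPE kernel. A standalone numeric sketch with tiny sizes:

import torch

dim, base, seq_len = 8, 10000, 4
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))   # [dim/2]
positions = torch.arange(seq_len).float()                            # [s]

freqs = torch.outer(positions, inv_freq)      # [s, dim/2], one angle per (position, frequency)
emb = torch.cat((freqs, freqs), dim=-1)       # [s, dim], duplicated as in the class above
emb = emb[:, None, None, :]                   # [s, 1, 1, dim], broadcastable over batch and heads
print(emb.shape)                              # torch.Size([4, 1, 1, 8])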
+ + # if "self_attn_mask_type" in kwargs: + # attn_mask_type = kwargs['self_attn_mask_type'] + # else: + # attn_mask_type = "whatever_default_is" + + # if attn_mask_type == "arbitrary": + # # @sudhakars: following logic doesn't work for `thd` + # attn_mask = kwargs['attention_mask'] + # attention_mask_inv = ~attn_mask + # generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + + # if generation_case: + # # @sudhakars: for some reason, `attention_mask` for generation is of the + # # form [b, 1, 1, s]. + # attention_mask_inv = attention_mask_inv.squeeze(1).squeeze(1) + # assert torch.tensor(torch.tensor(attention_mask_inv.shape).shape).item() == 2 + + # # Create `position_ids` on the fly using `attention_mask` since HF + # # does the same in generation logic. + # position_ids = attention_mask_inv.long().cumsum(-1) - 1 + # position_ids.masked_fill_(attention_mask_inv == 0, 1) + + # if "position_ids" in kwargs and kwargs['position_ids'] is not None: + # assert torch.all(torch.eq(position_ids, kwargs["position_ids"])), "position ids don't match match exactly!" + + # # convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for + # # context phase and context phase gets [b, s] sized attn mask + # seq_len = 1 if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 else attention_mask_inv.shape[1] + # arbitrary_attn_mask = torch.zeros(attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1]).bool() + # for sample_idx in range(attn_mask.shape[0]): + # pad_len = attn_mask[sample_idx].sum().int().item() + # # set the columns to padded + # arbitrary_attn_mask[sample_idx, :, :, :pad_len] = True + # # set the rows to padded + # if not generation_case: + # arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True + # arbitrary_attn_mask[sample_idx] = torch.tril(arbitrary_attn_mask[sample_idx].logical_not()).logical_not() + + # # Update the attention mask to arbitrary + # kwargs['attention_mask'] = arbitrary_attn_mask.cuda() + + # # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding + # # @sudhakars: change the hardcoded `dim` to something like config.head_dim + # te_rope_emb = GemmaRotaryEmbedding(dim=256, max_position_embeddings=self.gemma_config.max_position_embeddings).cuda() + # te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) + # else: + # When the `attention_mask` is not `arbitrary`, then for the purpose + # of this tutorial, we're using `padding_causal` (for context) and + # `padding` (for generation) + # @sudhakars: find a better way to provide the `tensor_format` + te_rope_emb = RotaryPositionEmbedding(self.gemma_config.head_dim)( + max_seq_len=self.gemma_config.max_position_embeddings + ).cuda() + + inference_params = kwargs["inference_params"] + # @sudhakars: big assumption that the input is "sbhd" + # batch_size = args[0].shape[0] + + # if inference_params.qkv_format_legacy == "thd": + # cache_params = kwargs["cache_params"] + # max_seqlen_q = cache_params.max_seqlen_q + # max_seqlen_kv = cache_params.max_seqlen_kv + # cu_seqlens_q = cache_params.cu_seqlens_q + # cu_seqlens_kv = cache_params.cu_seqlens_kv + # cu_seqlens_q_padded = cache_params.cu_seqlens_q_padded + # cu_seqlens_kv_padded = cache_params.cu_seqlens_kv_padded + # print(f"input_sequence_lengths (in forward): \n{inference_params.input_sequence_lengths}") + + # this args cannot be passed to TransformerLayer + keys_to_remove = [ + "position_ids", + "past_key_value", + "output_attentions", + "use_cache", + "cache_position", + ] + for key in 
keys_to_remove: + kwargs.pop(key, None) + + # We need to return tuple to be compatible with HF. + return ( + super().forward( + *args, + rotary_pos_emb=te_rope_emb, + # cu_seqlens_q=cu_seqlens_q, + # cu_seqlens_kv=cu_seqlens_kv, + # max_seqlen_q=max_seqlen_q, + # max_seqlen_kv=max_seqlen_kv, + **kwargs, + ), + ) + + +class StaticGemmaModel(torch.nn.Module): + """ + StaticGemma is based of HF GemmaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: GemmaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.normalizer = torch.tensor(self.model.config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): + print(f"StaticGemmaModel forward start") + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + # print(f"layer {i}") + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits, hidden_states + + +class GemmaGenerator(torch.nn.Module): + """ + GemmaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: GemmaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.gemma_layers = StaticGemmaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.gemma_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward( + self, + hidden_states: torch.Tensor, + mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): + logits, _ = self.gemma_layers( + hidden_states, attention_mask=mask, attn_mask_type=attn_mask_type + ) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + return next_tokens + + +class PartialForwardWrapper(torch.nn.Module): + """ + This class wraps a `torch.nn.Module` while partially modifying its `forward` + + CUDAGraphs' `make_graphed_callables` method takes in a module but if only + `functools.partial` is used to wrap the module, it changes the modules' + type and that interferes with the `make_graphed_callables` intrinsics. 
+ """ + + def __init__(self, module, **kwargs): + super().__init__() + self.module = module + self.partial_forward = partial(self.module.forward, **kwargs) + + def __call__(self, *args, **kwargs): + return self.partial_forward(*args, **kwargs) + + # @sudhakars: should we use better abstraction? + def set_inference_params(self, *args, **kwargs): + return self.module.set_inference_params(*args, **kwargs) + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `GemmaDecoderLayer` with custom `TEGemmaDecoderLayer`. + """ + original_gemma_decoder_cls = transformers.models.gemma.modeling_gemma.GemmaDecoderLayer + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.gemma.modeling_gemma.GemmaDecoderLayer = original_gemma_decoder_cls + + +class TEGemmaForCausalLM(GemmaForCausalLM): + """ + Causal LM created with `GemmaModel`. The underlying `GemmaDecoderLayer` + class is monkey-patched with `TEGemmaDecoderLayer` class before + initializing the causal LM with `GemmaForCausalLM`. + + Args: + config: GemmaConfig + """ + + def __init__(self, config: GemmaConfig): + with replace_decoder(te_decoder_cls=TEGemmaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = GemmaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticGemmaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths, max_seq_len=None): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) if max_seq_len is None else max_seq_len + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. fix the max seq len to + # the longest sequence in the batch + actual_max_seq_len = max_seq_len + inputs.data = new_input_ids[:, :actual_max_seq_len] + print(f"actual_max_seq_len: {actual_max_seq_len}") + + # For Paged Attention, make the valid sequences, multiple of 64 + # inputs.data = new_input_ids[:, :4].repeat(1, 16) + + def _next_64_multiply(self, x): + return ((x + 63) // 64) * 64 + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_hidden_states_buffer(self, input_ids: torch.Tensor): + tensor = torch.empty( + (input_ids.shape[0], input_ids.shape[1], self.hidden_size), + device="cuda", + dtype=torch.float32, + ) + # import pdb; pdb.set_trace() + return tensor + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. 
+ def _create_inference_params(self, *args, **kwargs): + infer_params = InferenceParams(*args, **kwargs) + + # max_batch_size = kwargs["max_batch_size"] + + # Initialize some legacy params + # _allocator = StaticBufferAllocator() + # infer_params.cached_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + # infer_params.input_sequence_lengths = _allocator((max_batch_size,), dtype=torch.int32, device="cuda") + + # These are updated in setup_cache_params_from_infer_params and they should be static for + # the duration of the context as well as the generation phase. + # infer_params.cu_seqlens_q, infer_params.cu_seqlens_kv, infer_params.cu_seqlens_q_padded, infer_params.cu_seqlens_kv_padded = [ + # _allocator(max_batch_size + 1, dtype=torch.int32, device="cuda") + # for _ in range(4) + # ] + + return infer_params + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _get_max_input_seq_len(self, input_ids): + return ( + input_ids.shape[1] + if not hasattr(self.config, "cuda_graphs_static_max_context_len") + else self.config.cuda_graphs_static_max_context_len + ) + + # The buffer for generation is some part (beginning) of hidden states buffer. + # This function returns pointer to it and also copies there data if provided. + def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): + # hidden_states_buffer has shape [b, s, hd] + # generation_buffer will have shape [b, 1, hd] + # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)" + # will return uncontiguous buffer, which we want to avoid. + output = hidden_states_buffer.view(-1)[ + : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2] + ] + if data_to_copy is not None: + output.copy_(data_to_copy.reshape(-1)) + generation_buffer = output.view( + (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2]) + ) + return generation_buffer + + def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): + # import pdb; pdb.set_trace() + hidden_states = self._create_hidden_states_buffer(input_ids) + hidden_states.data[:] = self.model.embed_tokens(input_ids) + + # We need to update offsets before every forward pass to make cache work properly. + lengths = input_ids.ne(0).sum(dim=1) + + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=lengths, max_input_length=input_ids.shape[1] + # ) + lengths = input_ids.ne(0).sum(dim=1) + inference_params.pre_step(OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))) + else: + inference_params.setup_before_new_input(length=input_ids.shape[1]) + + logits, hs_buffer = self._model_context_phase( + hidden_states, + attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", + ) + + # We choose logits coresponding with last token in each sequence, + # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) + # Tensor when qkv_format == "thd" and + # they are the last token in the sequence when qkv_format != "thd". + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + logits = logits[torch.arange(logits.size(0)), lengths - 1, :] + else: + logits = logits[:, -1, :] + + next_tokens = torch.argmax(logits, dim=1) + + # self.hidden_states have shape [b, s, hd]. 
+ # We return hidden state for the last token - output has shape [b, 1, hd] + hidden_states = self._get_generation_buffer( + hidden_states, self.model.embed_tokens(next_tokens) + ) + return hidden_states, next_tokens + + def _make_mask_one_token_longer(self, mask): + return torch.cat( + [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1 + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + pad_token_id: int = 0, + max_new_tokens: int = 0, + *args, + **kwargs, + ): + self.eval() + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast( + enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None + ): + + lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] + + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) + + # This is not needed since the padding to the left is already done in utils.py + # # Pad input_ids with zeros on the left to match max_input_sequence_len + # # This adds padding tokens (0) to the left side of each sequence in the batch + # # Shape goes from [batch_size, seq_len] to [batch_size, max_input_sequence_len] + # input_ids = F.pad( + # input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 + # ) + + if self.config.qkv_format == "thd": + # For thd layout padding is at the end, otherwise at the beginning. + TEGemmaForCausalLM._padding_to_end( + input_ids, + lengths, + max_seq_len=( + self.config.cuda_graphs_static_max_context_len + if self.config.generation_cuda_graphs + else None + ), + ) + + # import pdb; pdb.set_trace() + + # InferenceParams is a cache, where keys and values of previous tokens are stored. + # Moreover it stores length of both already generated and input sequences. + inference_params = self._create_inference_params( + max_batch_size=batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=self.config.head_dim, + head_dim_k=self.config.head_dim, + dtype=torch.bfloat16, + is_paged=self.config.is_paged, + page_size=64, + total_num_pages=64, # 64 * 64 (max_sequence_length) / 64 (page_size) + # is_cuda_graph=False + ) + + # def init_cache_params_in_infer_params(inference_params): + # _allocator = StaticBufferAllocator() + # inference_params.cached_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") + # inference_params.input_sequence_lengths = _allocator( + # (batch_size,), dtype=torch.int32, device="cuda") + + # init_cache_params_in_infer_params(inference_params) + + # inference_params.qkv_format_legacy = self.config.qkv_format + + self._model_context_phase.set_inference_params(inference_params) + self._model_generation_phase.set_inference_params(inference_params) + + print(f"context phase start") + # import pdb; pdb.set_trace() + hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + + print(f"context phase done") + # Generation phase. 
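At the very end of the generation loop below, the per-step token tensors are stacked and concatenated back onto input_ids. A small worked shape example of the stack(...).permute([1, 0]) step, assuming batch size 2 and three generated tokens:

import torch

input_ids = torch.tensor([[11, 12, 13],
                          [21, 22, 23]])                  # [b=2, s=3]
output_tokens = [torch.tensor([1, 2]),                    # step 1: one token per sequence
                 torch.tensor([3, 4]),                    # step 2
                 torch.tensor([5, 6])]                    # step 3

stacked = torch.stack(output_tokens)                      # [steps=3, b=2]
per_sequence = stacked.permute([1, 0])                    # [b=2, steps=3]
result = torch.cat((input_ids, per_sequence), dim=1)      # [b=2, s + steps = 6]
print(result)
# tensor([[11, 12, 13,  1,  3,  5],
#         [21, 22, 23,  2,  4,  6]])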
+ if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. + # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + # setup_cache_params_from_infer_params(inference_params, input_ids) + # @sudhakars: could create position_ids from mask here + next_tokens = self._model_generation_phase( + hidden_states, + mask, + attn_mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + ) + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. + if self.config.qkv_format == "thd": + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + lengths_tensor = torch.ones((next_tokens.shape[0],), dtype=int) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) + else: + inference_params.setup_before_new_input(length=1) + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", + ) + return logits + + +class TEGemmaForCausalLMCudaGraphs(TEGemmaForCausalLM): + """ + TEGemmaForCausalLMCudaGraphs is the version of the class TEGemmaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: GemmaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + self.config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. 
+        self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer)
+        self.inference_params = InferenceParams(
+            max_batch_size=self.config.cuda_graphs_static_batch_size,
+            max_sequence_length=self.config.cuda_graphs_static_max_seq_len,
+            num_heads_kv=self.config.num_key_value_heads,
+            head_dim_v=self.config.head_dim,
+            head_dim_k=self.config.head_dim,
+            dtype=torch.bfloat16,
+            is_paged=self.config.is_paged,
+            page_size=64,
+            total_num_pages=64,  # 64 * 64 (max_sequence_length) / 64 (page_size)
+        )
+
+        self._model_generation_phase.set_inference_params(self.inference_params)
+        self._model_context_phase.set_inference_params(self.inference_params)
+
+    def record(self):
+        # We want to record the model with training=False, because it will be used in generation.
+        self.eval()
+
+        # Here "the trick" happens. We override methods from TEGemmaForCausalLM
+        # with their recorded versions. After that, every invocation of each of them
+        # replays the captured graph with minimal CPU involvement, which leads to a
+        # large speedup.
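+        # A rough usage sketch (assuming the caller, e.g. utils.py, sets up the
+        # cuda_graphs_static_* fields of the config and loads the weights):
+        #
+        #     model = TEGemmaForCausalLMCudaGraphs(config).cuda()
+        #     model.record()                      # capture context + generation graphs once
+        #     tokens = model.generate(input_ids)  # subsequent calls replay the graphs
+        #
+        # Everything below must therefore use exactly the same static shapes as generate().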
+        input_shape = (
+            self.config.cuda_graphs_static_batch_size,
+            self.config.cuda_graphs_static_max_context_len,
+        )
+
+        # These lengths must match the lengths_tensor used in TEGemmaForCausalLM.generate().
+        lengths = torch.tensor(input_shape[0] * [input_shape[1]], device="cuda", dtype=torch.int32)
+        max_input_length = input_shape[1]
+
+        self.inference_params.pre_step(
+            OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))
+        )
+
+        # Record the context phase on the full static context buffer.
+        self._model_context_phase = self.record_graph(
+            self._model_context_phase, self.hidden_states_buffer, attn_mask_type="padding_causal"
+        )  # CUDA Graphs recording
+
+        input_shape = (self.config.cuda_graphs_static_batch_size, 1)
+        lengths = torch.tensor(input_shape[0] * [1], device="cuda", dtype=torch.int32)
+        max_input_length = input_shape[1]
+
+        self.inference_params.pre_step(
+            OrderedDict(zip(list(range(len(lengths))), lengths.tolist()))
+        )
+
+        # Record the generation phase on the [batch, 1, hidden] generation buffer.
+        self._model_generation_phase = self.record_graph(
+            self._model_generation_phase, self.generation_buffer, attn_mask_type="padding"
+        )  # CUDA Graphs recording
+
+    """
+    Functions _create_hidden_states_buffer and _create_inference_params
+    from the base class are overridden to make hidden_states and inference_params static,
+    i.e. not changing their position in memory between invocations.
+    """
+
+    def _create_hidden_states_buffer(self, *args, **kwargs):
+        return self.hidden_states_buffer
+
+    def _create_inference_params(self, *args, **kwargs):
+        self.inference_params.reset()
+        return self.inference_params
+
+    def _get_max_input_seq_len(self, _):
+        return self.config.cuda_graphs_static_max_context_len
+
+    @torch.no_grad()
+    def record_graph(self, function, input_tensor, **sample_kwargs):
+        # function is invoked on the argument (input_tensor,) and all kernels are recorded.
+        # record_graph() returns the captured function, which can be run later with much
+        # lower CPU overhead.
+        fp8_format = Format.HYBRID
+        fp8_recipe = DelayedScaling(
+            fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max"
+        )
+
+        # We need both autocasts: FP8 for operations that can run in lower precision
+        # and BF16 for those that cannot.
+        with autocast(dtype=torch.bfloat16, cache_enabled=False):
+            graphed_function = te.pytorch.make_graphed_callables(
+                function,
+                (input_tensor,),
+                fp8_enabled=self.config.fp8,
+                fp8_recipe=fp8_recipe,
+                allow_unused_input=True,
+                num_warmup_iters=5,
+                sample_kwargs=sample_kwargs,
+            )
+        return graphed_function
diff --git a/docs/examples/te_gemma/te_llama.py b/docs/examples/te_gemma/te_llama.py
new file mode 100755
index 0000000000..637f4f574c
--- /dev/null
+++ b/docs/examples/te_gemma/te_llama.py
@@ -0,0 +1,826 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See LICENSE for license information.
+ +from contextlib import contextmanager + +from typing import Optional +from functools import partial +from collections import OrderedDict + +import torch +import transformer_engine as te +from transformer_engine.pytorch.attention import InferenceParams, RotaryPositionEmbedding +from transformer_engine.common.recipe import Format, DelayedScaling +from torch.cuda.amp import autocast + +import transformers +from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaConfig, LlamaModel + +import torch.nn.functional as F + + +def setup_cache_params_from_infer_params(inference_params, lengths_tensor, max_input_length): + """ + Converts the `input_ids` to variables like `cu_seqlens_q/kv`, etc. which + will be used later. + + (Currently a hack, this should be reformatted to a better method) + """ + + assert ( + lengths_tensor is not None and max_input_length is not None + ), 'lengths_tensor and max_input_length should not be none for qkv_format = "thd"' + torch.add( + inference_params.cached_sequence_lengths, + inference_params.input_sequence_lengths, + out=inference_params.cached_sequence_lengths, + ) + inference_params.input_sequence_lengths.copy_(lengths_tensor) + inference_params.max_incoming_seq_len = max_input_length + + max_seqlen_q, max_seqlen_kv = ( + inference_params.max_incoming_seq_len, + inference_params.max_sequence_length, + ) + + # # Allocation of buffers, it works correctly with CUDA Graphs. + _allocator = StaticBufferAllocator() + NR_BUFFERS = 4 + + cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded = [ + _allocator(inference_params.max_batch_size + 1, dtype=torch.int32, device="cuda") + for _ in range(NR_BUFFERS) + ] + + torch.cumsum(inference_params.input_sequence_lengths, dim=0, out=cu_seqlens_q[1:]) + torch.cumsum( + inference_params.cached_sequence_lengths + inference_params.input_sequence_lengths, + dim=0, + out=cu_seqlens_kv[1:], + ) + # If layer has shape [b * s_layer, h, d] + # offsets are of the form [k * s_layer * h * d for k = 0, ..., batch_size] + cu_seqlens_q_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_q + ) + cu_seqlens_kv_padded.copy_( + torch.arange(0, inference_params.max_batch_size + 1, device="cuda") * max_seqlen_kv + ) + + # inference_params.step_dict = OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + inference_params.pre_step( + OrderedDict(zip(list(range(len(lengths_tensor))), lengths_tensor.tolist())) + ) + + # print(inference_params.step_dict) + + def get_cache_params_in_infer_params(): + return ( + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, + ) + + # For the time being, create an ad-hoc field in `inference_params` to get the variables. + # @sudhakars: to create a better way later. 
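+    # Worked illustration (assuming max_batch_size == 3): for a fresh cache
+    # (cached_sequence_lengths == [0, 0, 0]) and input_sequence_lengths == [3, 5, 2],
+    # the buffers computed above hold
+    #     cu_seqlens_q  == [0, 3, 8, 10]
+    #     cu_seqlens_kv == [0, 3, 8, 10]
+    # while cu_seqlens_q_padded / cu_seqlens_kv_padded hold offsets into the padded layout,
+    # i.e. multiples of max_seqlen_q and max_seqlen_kv respectively.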
+ inference_params.get_cache_params_from_infer_params = get_cache_params_in_infer_params + + +# This class has been modified from +# https://github.com/huggingface/transformers/blob/98adf24883b007c2a7fb17bab1c01b1614673433/src/transformers/models/gemma/modeling_gemma.py +class LlamaRotaryEmbedding(torch.nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float() / self.dim) + ) + self.register_buffer("inv_freq", tensor=inv_freq, persistent=False) + + @torch.no_grad() + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + self.inv_freq.to(x.device) + inv_freq_expanded = ( + self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + ) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = ( + device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + ) + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + return emb.unsqueeze(2) # should return in [b, s, 1, d] format + + +class StaticBufferAllocator(torch.nn.Module): + """ + This class is used when we use te.make_graphed_callable(). + CUDA Graphs require all tensors to be static. Neverthlessly, + torch API make_graphed_callable() takes care of output of torch modules, + and makes them static. Thus by wrapping allocation of memory into + torch.nn.Module, we can greatly simplify our code. + """ + + # pylint: disable=no-self-use + def forward(self, size, dtype, device): + """ + Return buffer of given size, dtype and device. + """ + return torch.zeros(size, dtype=dtype, device=device) + + +class TELlamaDecoderLayer(te.pytorch.TransformerLayer): + """ + Wrapper class over TE's `TransformerLayer`. This makes the wrapper very + similar to HF's `LlamaDecoderLayer` and easier to replace it in the code. + + Args: + config: LlamaConfig + args: positional args (for compatibility with `LlamaDecoderLayer`) + kwargs: keyword args (for compatibility with `LlamaDecoderLayer`) + """ + + def __init__(self, config: LlamaConfig, layer_idx: int, *args, **kwargs): + + self.llama_config = config + self.head_dim = self.llama_config.hidden_size // self.llama_config.num_attention_heads + + super().__init__( + hidden_size=config.hidden_size, + ffn_hidden_size=config.intermediate_size, + num_attention_heads=config.num_attention_heads, + bias=False, # LLaMA specific + layernorm_epsilon=config.rms_norm_eps, + hidden_dropout=0, + attention_dropout=0, + fuse_qkv_params=config.fuse_qkv_params, + normalization="RMSNorm", + activation="swiglu", # LLaMA specific + # attn_input_format=config.qkv_format, + attn_input_format="bshd", + num_gqa_groups=config.num_key_value_heads, + kv_channels=self.head_dim, # LLaMA specific + layer_number=( + layer_idx + 1 + ), # Layer numbers in TE starts from 1, not 0 like in the HF. + zero_centered_gamma=True, # LLaMA specific + ) + + def alloc(self, size, dtype, device): + """ + Allocated the buffer and works correctly with CUDA Graphs. 
+ """ + return self._allocator(size, dtype, device) + + def forward(self, *args, **kwargs): # We need to additionally pass positional encoding. + + if "self_attn_mask_type" in kwargs: + attn_mask_type = kwargs["self_attn_mask_type"] + else: + attn_mask_type = "whatever_default_is" + + if attn_mask_type == "arbitrary": + # @sudhakars: following logic doesn't work for `thd` + attn_mask = kwargs["attention_mask"] + attention_mask_inv = ~attn_mask + generation_case = torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + + if generation_case: + # @sudhakars: for some reason, `attention_mask` for generation is of the + # form [b, 1, 1, s]. + attention_mask_inv = attention_mask_inv.squeeze(1).squeeze(1) + assert torch.tensor(torch.tensor(attention_mask_inv.shape).shape).item() == 2 + + # Create `position_ids` on the fly using `attention_mask` since HF + # does the same in generation logic. + position_ids = attention_mask_inv.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask_inv == 0, 1) + + if "position_ids" in kwargs and kwargs["position_ids"] is not None: + assert torch.all( + torch.eq(position_ids, kwargs["position_ids"]) + ), "position ids don't match match exactly!" + + # convert [b, s] to [b, 1, s, s] since `arbitrary` is only set for + # context phase and context phase gets [b, s] sized attn mask + seq_len = ( + 1 + if torch.tensor(torch.tensor(attn_mask.shape).shape).item() > 2 + else attention_mask_inv.shape[1] + ) + arbitrary_attn_mask = torch.zeros( + attention_mask_inv.shape[0], 1, seq_len, attention_mask_inv.shape[1] + ).bool() + for sample_idx in range(attn_mask.shape[0]): + pad_len = attn_mask[sample_idx].sum().int().item() + # set the columns to padded + arbitrary_attn_mask[sample_idx, :, :, :pad_len] = True + # set the rows to padded + if not generation_case: + arbitrary_attn_mask[sample_idx, :, :pad_len, :] = True + arbitrary_attn_mask[sample_idx] = torch.tril( + arbitrary_attn_mask[sample_idx].logical_not() + ).logical_not() + + # Update the attention mask to arbitrary + kwargs["attention_mask"] = arbitrary_attn_mask.cuda() + + # @sudhakars: `max_position_embeddings` is not even used inside GemmaRotaryEmbedding + # @sudhakars: change the hardcoded `dim` to something like config.head_dim + te_rope_emb = LlamaRotaryEmbedding( + dim=self.head_dim, max_position_embeddings=self.llama_config.max_position_embeddings + ).cuda() + te_rope_emb = te_rope_emb(args[0], position_ids.cuda()) + else: + # When the `attention_mask` is not `arbitrary`, then for the purpose + # of this tutorial, we're using `padding_causal` (for context) and + # `padding` (for generation) + # @sudhakars: find a better way to provide the `tensor_format` + te_rope_emb = RotaryPositionEmbedding(self.head_dim)( # Use self.head_dim + max_seq_len=self.llama_config.max_position_embeddings + ).cuda() + + inference_params = kwargs["inference_params"] + # @sudhakars: big assumption that the input is "sbhd" + # batch_size = args[0].shape[0] + if inference_params.qkv_format_legacy == "thd": + ( + max_seqlen_q, + max_seqlen_kv, + cu_seqlens_q, + cu_seqlens_kv, + cu_seqlens_q_padded, + cu_seqlens_kv_padded, + ) = inference_params.get_cache_params_from_infer_params() + + # this args cannot be passed to TransformerLayer + keys_to_remove = [ + "position_ids", + "past_key_value", + "output_attentions", + "use_cache", + "cache_position", + ] + for key in keys_to_remove: + kwargs.pop(key, None) + + # import pdb; pdb.set_trace() + # We need to return tuple to be compatible with HF. 
+ return ( + super().forward( + *args, + rotary_pos_emb=te_rope_emb, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_kv=cu_seqlens_kv, + max_seqlen_q=max_seqlen_q, + max_seqlen_kv=max_seqlen_kv, + **kwargs + ), + ) + + +class StaticLlamaModel(torch.nn.Module): + """ + StaticLlama is based of HF LlamaModel class. + It is adjusted to work properly with CUDA Graphs. + """ + + def __init__( + self, + model: LlamaModel, + dtype: torch.dtype, + mask: torch.Tensor, + lm_head: torch.nn.Module, + ): + super().__init__() + self.model = model + self.llama_config = model.config # Store LlamaConfig + self.normalizer = torch.tensor(self.llama_config.hidden_size**0.5, dtype=dtype) + self.mask = mask + self.lm_head = lm_head + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + + # @sudhakars: is `arbitrary` fine being the default here? + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor = None, + attn_mask_type: str = "arbitrary", + ): + # import pdb; pdb.set_trace() + if hidden_states.shape[1] > 1: + torch.save(hidden_states, "input_ctxt.pth") + + with torch.no_grad(): + # static operation - for CUDA graphs + hidden_states.data[:] = hidden_states.data[:] * self.normalizer + + for i, decoder_layer in enumerate(self.model.layers): + hidden_states.data[:] = decoder_layer( + hidden_states, + attention_mask=attention_mask, + self_attn_mask_type=self.mask if attn_mask_type is None else attn_mask_type, + inference_params=self.inference_params, + )[ + 0 + ] # static copy - for CUDA graphs + + hidden_states.copy_(self.model.norm(hidden_states)) # static copy - for CUDA graphs + logits = self.lm_head(hidden_states) + logits = logits.float() + return logits + + +class LlamaGenerator(torch.nn.Module): + """ + LlamaGenerator gets one layer of embeddins, + makes forward pass and returns next tokens. + """ + + def __init__( + self, model: LlamaModel, lm_head: torch.nn.Module, dtype: torch.dtype, qkv_format: str + ): + super().__init__() + self.model = model + self.llama_layers = StaticLlamaModel(model, dtype, "arbitrary", lm_head) + self.qkv_format = qkv_format + + def set_inference_params(self, inference_params): + self.inference_params = inference_params + self.llama_layers.set_inference_params(inference_params) + + # @sudhakars: is `arbitrary` a good default value here? + def forward( + self, hidden_states: torch.Tensor, mask: torch.Tensor = None, mask_type: str = "arbitrary" + ): + logits = self.llama_layers(hidden_states, attention_mask=mask, attn_mask_type=mask_type) + + assert logits.shape[0] == hidden_states.shape[0] # b + assert logits.shape[1] == hidden_states.shape[1] # seq_len + # logits.shape[2] = number of tokens + logits = logits[:, -1, :] + next_tokens = torch.argmax(logits, dim=1) + + # static copy for CUDA graphs + hidden_states.copy_(self.model.embed_tokens(next_tokens).unsqueeze(1)) + + # self.inference_params contains for example kv_cache. + # This needs to be called before every pass, + # to update the information of sequence lengths. + # Here we increase sequence offsets by one, + # because we generated one token for every sequence. 
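+        # Note on the in-place `copy_` above: CUDA Graphs replay kernels on fixed memory
+        # addresses, so the embedding of the new token has to be written into the existing
+        # `hidden_states` buffer. Rebinding the name instead, e.g.
+        #     hidden_states = self.model.embed_tokens(next_tokens).unsqueeze(1)
+        # would allocate a new tensor that the captured graph knows nothing about.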
+ if self.qkv_format == "thd": + # self.inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params( + self.inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1, + ) + else: + self.inference_params.setup_before_new_input(length=1) + + return next_tokens + + +class PartialForwardWrapper(torch.nn.Module): + """ + This class wraps a `torch.nn.Module` while partially modifying its `forward` + + CUDAGraphs' `make_graphed_callables` method takes in a module but if only + `functools.partial` is used to wrap the module, it changes the modules' + type and that interferes with the `make_graphed_callables` intrinsics. + """ + + def __init__(self, module, **kwargs): + super().__init__() + self.module = module + self.partial_forward = partial(self.module.forward, **kwargs) + + def __call__(self, *args, **kwargs): + return self.partial_forward(*args, **kwargs) + + # @sudhakars: should we use better abstraction? + def set_inference_params(self, *args, **kwargs): + return self.module.set_inference_params(*args, **kwargs) + + +@contextmanager +def replace_decoder(te_decoder_cls): + """ + Replace `LlamaDecoderLayer` with custom `TELlamaDecoderLayer`. + """ + original_llama_decoder_cls = transformers.models.llama.modeling_llama.LlamaDecoderLayer + transformers.models.llama.modeling_llama.LlamaDecoderLayer = te_decoder_cls + try: + yield + finally: + transformers.models.llama.modeling_llama.LlamaDecoderLayer = original_llama_decoder_cls + + +class TELlamaForCausalLM(LlamaForCausalLM): + """ + Causal LM created with `LlamaModel`. The underlying `LlamaDecoderLayer` + class is monkey-patched with `TELlamaDecoderLayer` class before + initializing the causal LM with `LlamaForCausalLM`. + + Args: + config: LlamaConfig + """ + + def __init__(self, config: LlamaConfig): + with replace_decoder(te_decoder_cls=TELlamaDecoderLayer): + super().__init__(config) + self.config = config + self.to(torch.bfloat16).cuda() + self.hidden_size = config.hidden_size + self._model_generation_phase = LlamaGenerator( + lm_head=self.lm_head, + model=self.model, + dtype=torch.bfloat16, + qkv_format=config.qkv_format, + ) + self._model_context_phase = StaticLlamaModel( + self.model, torch.bfloat16, "arbitrary", self.lm_head + ) + + if self.config.fp8: + self.fp8_recipe = DelayedScaling( + fp8_format=Format.HYBRID, amax_history_len=16, amax_compute_algo="max" + ) + + @staticmethod + def _padding_to_end(inputs, lengths): + """ + Gets the tensor with sequence padded from the beginning and + return tensor padded from its end. + + Parameters + ---------- + inputs : Tensor, tensor with shape [b, s] containing token numbers. + It's padded from the beggining. + lengths: Tensor, tensor with shape [s] with lengths of the sequences. + + """ + max_seq_len = torch.max(lengths) + batch_size, max_seq_len = inputs.shape + new_input_ids = inputs.clone() + for i in range(batch_size): + new_input_ids[i, : lengths[i]] = inputs[i, (max_seq_len - lengths[i]) : max_seq_len] + new_input_ids[i, lengths[i] :] = inputs[i, 0 : (max_seq_len - lengths[i])] + + # Disable the input preparation that involves extra padding + # inputs.copy_(new_input_ids) + + # Trim the inputs to no extra padding i.e. 
fix the max seq len to + # the longest sequence in the batch + actual_max_seq_len = inputs.ne(0).sum(dim=1).max() + inputs.data = new_input_ids[:, :actual_max_seq_len] + + # For Paged Attention, make the valid sequences, multiple of 64 + # inputs.data = new_input_ids[:, :4].repeat(1, 16) + + def _next_64_multiply(self, x): + return ((x + 63) // 64) * 64 + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_hidden_states_buffer(self, input_ids: torch.Tensor): + return torch.empty( + (input_ids.shape[0], input_ids.shape[1], self.hidden_size), + device="cuda", + dtype=torch.float32, + ) + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _create_inference_params(self, *args, **kwargs): + infer_params = InferenceParams(*args, **kwargs) + + max_batch_size = kwargs["max_batch_size"] + + # Initialize some legacy params + infer_params.cached_sequence_lengths = torch.zeros( + (max_batch_size,), device="cuda", dtype=torch.int32 + ) + infer_params.input_sequence_lengths = torch.zeros( + (max_batch_size,), device="cuda", dtype=torch.int32 + ) + + return infer_params + + # This function is overriden in TeGEmmaForCausalLMCudaGraphs. + def _get_max_input_seq_len(self, input_ids): + return input_ids.shape[1] + + # The buffer for generation is some part (beginning) of hidden states buffer. + # This function returns pointer to it and also copies there data if provided. + def _get_generation_buffer(self, hidden_states_buffer, data_to_copy=None): + # hidden_states_buffer has shape [b, s, hd] + # generation_buffer will have shape [b, 1, hd] + # Notice that "generation_buffer = hidden_states_buffer[:, 0, :].unsqueeze(1)" + # will return uncontiguous buffer, which we want to avoid. + output = hidden_states_buffer.view(-1)[ + : hidden_states_buffer.shape[0] * hidden_states_buffer.shape[2] + ] + if data_to_copy is not None: + output.copy_(data_to_copy.reshape(-1)) + generation_buffer = output.view( + (hidden_states_buffer.shape[0], 1, hidden_states_buffer.shape[2]) + ) + return generation_buffer + + def _generate_context_phase(self, input_ids: torch.Tensor, inference_params: InferenceParams): + hidden_states = self._create_hidden_states_buffer(input_ids) + hidden_states.data[:] = self.model.embed_tokens(input_ids) + + # We need to update offsets before every forward pass to make cache work properly. + lengths = input_ids.ne(0).sum(dim=1) + # import pdb; pdb.set_trace() + if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=lengths, max_input_length=input_ids.shape[1] + # ) + lengths = input_ids.ne(0).sum(dim=1) + max_input_length = input_ids.shape[1] + setup_cache_params_from_infer_params(inference_params, lengths, max_input_length) + else: + inference_params.setup_before_new_input(length=input_ids.shape[1]) + + logits = self._model_context_phase( + hidden_states, + attention_mask=((input_ids == 0) if self.config.qkv_format != "thd" else None), + attn_mask_type="padding_causal" if self.config.qkv_format == "thd" else "arbitrary", + ) + + # We choose logits coresponding with last token in each sequence, + # which have various lengths - they are stored in (inference_params.incoming_seq_len - 1) + # Tensor when qkv_format == "thd" and + # they are the last token in the sequence when qkv_format != "thd". 
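+        # Illustration: with two sequences of lengths [3, 5] in a padded [b, s, vocab] logits
+        # tensor, the thd branch below selects logits[0, 2, :] and logits[1, 4, :], i.e. the
+        # logits at position (length - 1) of every sequence.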
+ if self.config.qkv_format == "thd": + logits = logits[ + torch.arange(logits.size(0)), inference_params.input_sequence_lengths - 1, : + ] + else: + logits = logits[:, -1, :] + torch.save(logits, "logits_ctxt.pth") + next_tokens = torch.argmax(logits, dim=1) + + # self.hidden_states have shape [b, s, hd]. + # We return hidden state for the last token - output has shape [b, 1, hd] + hidden_states = self._get_generation_buffer( + hidden_states, self.model.embed_tokens(next_tokens) + ) + return hidden_states, next_tokens + + def _make_mask_one_token_longer(self, mask): + return torch.cat( + [mask, torch.zeros(mask.size(0), 1, 1, 1, dtype=torch.bool, device=mask.device)], dim=-1 + ) + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + pad_token_id: int = 0, + max_new_tokens: int = 0, + *args, + **kwargs + ): + self.eval() + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False), te.pytorch.fp8_autocast( + enabled=self.config.fp8, fp8_recipe=self.fp8_recipe if self.config.fp8 else None + ): + + lengths = torch.sum(input_ids.ne(pad_token_id), dim=-1).squeeze() # [s] + # input_ids = F.pad( + # input_ids, (max_input_sequence_len - input_ids.shape[1], 0), "constant", 0 + # ) + + if self.config.qkv_format == "thd": + # For thd layout padding is at the end, otherwise at the beginning. + TELlamaForCausalLM._padding_to_end(input_ids, lengths) + + batch_size, max_input_sequence_len = input_ids.shape[0], self._get_max_input_seq_len( + input_ids + ) + # import pdb; pdb.set_trace() + + # InferenceParams is a cache, where keys and values of previous tokens are stored. + # Moreover it stores length of both already generated and input sequences. + head_dim = self.config.hidden_size // self.config.num_attention_heads + inference_params = self._create_inference_params( + max_batch_size=batch_size, + # num_layers=self.config.num_hidden_layers, + max_sequence_length=self._next_64_multiply(max_input_sequence_len + max_new_tokens), + num_heads_kv=self.config.num_key_value_heads, + # num_heads_q=self.config.num_attention_heads, + head_dim_v=head_dim, + head_dim_k=head_dim, + dtype=torch.bfloat16, + is_paged=True, + page_size=64, + total_num_pages=64 * 3, # 64 * 64 (max_sequence_length) / 64 (page_size) + # is_cuda_graph=False + ) + + def init_cache_params_in_infer_params(inference_params): + inference_params.cached_sequence_lengths = torch.zeros( + (batch_size,), device="cuda", dtype=torch.int32 + ) + inference_params.input_sequence_lengths = torch.zeros( + (batch_size,), device="cuda", dtype=torch.int32 + ) + + init_cache_params_in_infer_params(inference_params) + inference_params.qkv_format_legacy = self.config.qkv_format + + self._model_context_phase.set_inference_params(inference_params) + self._model_generation_phase.set_inference_params(inference_params) + + hidden_states, next_tokens = self._generate_context_phase(input_ids, inference_params) + + # Generation phase. 
+ if self.config.qkv_format == "thd": + # inference_params.setup_before_new_input( + # lengths_tensor=torch.ones((next_tokens.shape[0],), device="cuda"), + # max_input_length=1, + # ) + setup_cache_params_from_infer_params( + inference_params, + lengths_tensor=torch.ones((next_tokens.shape[0],), dtype=int), + max_input_length=1, + ) + else: + inference_params.setup_before_new_input(length=1) + + output_tokens = [next_tokens] + + mask = None + if self.config.qkv_format != "thd": + mask = (input_ids == 0).unsqueeze(1).unsqueeze(1) + + for _ in range(max_new_tokens): + if self.config.qkv_format != "thd": + # It will not work with cuda graphs, but it is not used for thd qkv_format. + # Attention mask in bshd needs attn_mask increased by 1 to + # include the next token to be generated + mask = self._make_mask_one_token_longer(mask) + + # setup_cache_params_from_infer_params(inference_params, input_ids) + # @sudhakars: could create position_ids from mask here + next_tokens = self._model_generation_phase( + hidden_states, + mask, + mask_type="padding" if self.config.qkv_format == "thd" else "arbitrary", + ) + # next_tokens is static output tensor, so we need to clone it + # - it gets changed every iteration. + output_tokens.append(next_tokens.clone()) + + result = torch.cat((input_ids, torch.stack(output_tokens).permute([1, 0])), dim=1) + return result + + def forward(self, *args, **kwargs): + self._model_context_phase.set_inference_params(None) + hidden_states = self.model.embed_tokens(kwargs["input_ids"]) + logits = self._model_context_phase( + hidden_states, + attention_mask=( + (kwargs["input_ids"] == 0) if self.config.qkv_format != "thd" else None + ), + attn_mask_type="arbitrary", + ) + return logits + + +class TELlamaForCausalLMCudaGraphs(TELlamaForCausalLM): + """ + TELlamaForCausalLMCudaGraphs is the version of the class TELlamaForCausalLM + using CUDA Graphs to speed it up. We need to make one trade-off. + Namely, batch_size, max_seq_len and max_context_seq_len need to be static. + It is necessary to run generation with the same value of + these variables that we recorded graph on. + """ + + def __init__(self, config: LlamaConfig): + super().__init__(config) + assert ( + config.qkv_format == "thd" + ), "Generation with CUDA Graphs are implemented only for thd format." + + # Preparation of the static buffers. + self.config = config + self.hidden_states_buffer = torch.empty( + ( + config.cuda_graphs_static_batch_size, + config.cuda_graphs_static_max_context_len, + config.hidden_size, + ) + ).cuda() + # This is in fact part of the buffer for hidden_states. + self.generation_buffer = self._get_generation_buffer(self.hidden_states_buffer) + self.inference_params = InferenceParams( + max_batch_size=config.cuda_graphs_static_batch_size, + max_sequence_length=config.cuda_graphs_static_max_seq_len, + qkv_format="thd", + ) + + self._model_generation_phase.set_inference_params(self.inference_params) + self._model_context_phase.set_inference_params(self.inference_params) + + def record(self): + # We want to record model in training=False, because it will be used in generation. + self.eval() + + # Here "the trick" happens. We override methods from TELlamaForCausalLM + # with their recorded version. After invocation of each of them, + # captured graph will be replayed with minimal usage of CPU, + # what will lead to huge speedup. 
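+        # Two separate graphs are captured below because the two phases run on different
+        # static shapes: the context phase on the full
+        # [batch, cuda_graphs_static_max_context_len, hidden] buffer and the generation phase
+        # on the [batch, 1, hidden] generation buffer. A captured graph can only be replayed
+        # with the exact shapes it was recorded with.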
+ input_shape = ( + self.config.cuda_graphs_static_batch_size, + self.config.cuda_graphs_static_max_context_len, + ) + self.inference_params.reset() + self.inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + max_input_length=input_shape[1], + ) + self._model_context_phase = self.record_graph( + PartialForwardWrapper( + self._model_context_phase, + attn_mask_type=( + "padding_causal" if self.inference_params.qkv_format == "thd" else "arbitrary" + ), + ), + self.hidden_states_buffer, + ) # CUDA Graphs recording + + input_shape = (self.config.cuda_graphs_static_batch_size, 1) + self.inference_params.reset() + self.inference_params.setup_before_new_input( + lengths_tensor=torch.tensor(input_shape[0] * [input_shape[1]], device="cuda"), + max_input_length=input_shape[1], + ) + self._model_generation_phase = self.record_graph( + PartialForwardWrapper( + self._model_generation_phase, + mask_type="padding" if self.inference_params.qkv_format == "thd" else "arbitrary", + ), + self.generation_buffer, + ) # CUDA Graphs recording + + """ + Functions _create_hidden_states_buffer and _create_inference_params + from base class are overriden to make hidden_states and inference_params static + - not changing their position in memory between every invocation. + """ + + def _create_hidden_states_buffer(self, *args, **kwargs): + return self.hidden_states_buffer + + def _create_inference_params(self, *args, **kwargs): + self.inference_params.reset() + return self.inference_params + + def _get_max_input_seq_len(self, _): + return self.config.cuda_graphs_static_max_context_len + + @torch.no_grad() + def record_graph(self, function, input_tensor): + # function is invoked on argument (self.hidden_states,) and all kernels are recorded. + # record_graph() returns captured function, which can be run later with lower of th CPU. + fp8_format = Format.HYBRID + fp8_recipe = DelayedScaling( + fp8_format=fp8_format, amax_history_len=1024, amax_compute_algo="max" + ) + + # We need both autocasts: FP8 for operations that can run in lower precision + # and BF16 for those that cannot. + with autocast(dtype=torch.bfloat16, cache_enabled=False): + graphed_function = te.pytorch.make_graphed_callables( + function, + (input_tensor,), + fp8_enabled=self.config.fp8, + fp8_recipe=fp8_recipe, + allow_unused_input=True, + num_warmup_iters=3, + ) + return graphed_function diff --git a/docs/examples/te_gemma/te_llama_loading_weights.py b/docs/examples/te_gemma/te_llama_loading_weights.py new file mode 100755 index 0000000000..a5ab151f67 --- /dev/null +++ b/docs/examples/te_gemma/te_llama_loading_weights.py @@ -0,0 +1,224 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import os +import re +import gc +import torch + +from typing import List + +from transformer_engine.pytorch.fp8 import fp8_model_init + +from transformers.modeling_utils import load_state_dict, _load_state_dict_into_model +from transformers.utils.hub import get_checkpoint_shard_files + +""" + This file contains logic of mapping the HuggingFace LlamaModel parameters + with TransformerEngine TransformerLayer. When we have initialized Transformer models + both with HF and with TE, we can copy parameters from the first to the second. +""" + + +def _load_weights_for_fp8_model(vanilla_model, hyperparams): + # The weights are loaded from the file with state_dict + # of model with weights which contains also fp8 parameters. 
+ # The weights are in BF16 precision, but they contain fp8 metadata + # computed by the calibration procedure. + vanilla_model.load_state_dict( + torch.load(hyperparams.fp8_model_weights_filename), + strict=False, + # strict = false, because some parameters have + # multiple pointers to the same weight + # vanilla_model._model_context_phase.model + # and vanilla_model._model_generation_phase.model + ) + + +def _load_weights_for_standard_model(vanilla_model, config): + # The weights are loaded from the file with original weights. + archive_file = os.path.join(config.model_name, "model.safetensors.index.json") + resolved_archive_file, _ = get_checkpoint_shard_files(config.model_name, archive_file) + total_dict = {} + for shard_file in resolved_archive_file: + state_dict = load_state_dict(shard_file) + total_dict.update(state_dict) + + replace_params( + total_dict, + vanilla_model.state_dict(), + config, + qkv_fused_and_interleaved=config.fuse_qkv_params, + ) + # Copy parameters like embedding: + _load_state_dict_into_model(vanilla_model, total_dict, start_prefix="") + + # Force mem release. Taken from huggingface code. + del total_dict + gc.collect() + + +def load_te_model(cls, config): + """ + Custom method adapted from `from_pretrained` method in HuggingFace + Transformers repo: + https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579 + """ + + config.use_cache = False # To make TransformerLayer compatible with LlamaModel + with fp8_model_init(config.fp8_model_init): + # there we need only to create model + vanilla_model = cls(config).to(torch.bfloat16).cuda() + + # return vanilla_model + # and now we copy the weights into it + if config.fp8_model_weights_filename is not None: + _load_weights_for_fp8_model(vanilla_model, config) + else: + _load_weights_for_standard_model(vanilla_model, config) + + return vanilla_model + + +def _get_all_layer_prefixes_to_update(hf_state_dict): + """ + There are many parameters in hf_state_dict, whose name start with "model.layers.[number]." + This function extracts all strings like "model.layers.[number]." + that are starting strings of keys in hf_state_dict. + """ + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." + m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + return all_layer_prefixes + + +def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): + # collect all layer prefixes to update + all_layer_prefixes = set() + for param_key in hf_state_dict.keys(): + layer_prefix_pat = "model.layers.\d+." 
+ m = re.match(layer_prefix_pat, param_key) + if m is not None: + all_layer_prefixes.add(m.group()) + + for layer_prefix in all_layer_prefixes: + # When loading weights into models with less number of layers, skip the + # copy if the corresponding layer doesn't exist in HF model + if layer_prefix + "input_layernorm.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.layer_norm_weight"].data[ + : + ] = hf_state_dict[layer_prefix + "input_layernorm.weight"].data[:] + + if layer_prefix + "self_attn.q_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.query_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.q_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.k_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.key_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.k_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.v_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.layernorm_qkv.value_weight"].data[:] = ( + hf_state_dict[layer_prefix + "self_attn.v_proj.weight"].data[:] + ) + + if layer_prefix + "self_attn.o_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "self_attention.proj.weight"].data[:] = hf_state_dict[ + layer_prefix + "self_attn.o_proj.weight" + ].data[:] + + if layer_prefix + "post_attention_layernorm.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.layer_norm_weight"].data[:] = hf_state_dict[ + layer_prefix + "post_attention_layernorm.weight" + ].data[:] + + # It may happen that gate_proj.weight and up_proj.weight will be in the different files, so we need to + # load them separately. + if layer_prefix + "mlp.gate_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc1_weight"].data[ + : config.intermediate_size + ] = hf_state_dict[layer_prefix + "mlp.gate_proj.weight"].data + + if layer_prefix + "mlp.up_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc1_weight"].data[ + config.intermediate_size : + ] = hf_state_dict[layer_prefix + "mlp.up_proj.weight"].data + + if layer_prefix + "mlp.down_proj.weight" in hf_state_dict: + te_state_dict[layer_prefix + "layernorm_mlp.fc2_weight"].data[:] = hf_state_dict[ + layer_prefix + "mlp.down_proj.weight" + ].data[:] + return all_layer_prefixes + + +# def replace_params(hf_state_dict, te_state_dict, config, qkv_fused_and_interleaved=False): +# """ +# Replaces params from TE TransformerLayer state_dict with corresponding parameters +# from HuggingFace LlamaModel state_dict. 
+# """ +# all_layer_prefixes: List[str] = _get_all_layer_prefixes_to_update(hf_state_dict) + +# head_dim = config.hidden_size // config.num_attention_heads + +# for layer_prefix in all_layer_prefixes: + +# def copy_from_ht_to_te(te_name, hf_name, start=None, end=None): +# te_state_dict[layer_prefix + te_name].data[start:end].copy_( +# hf_state_dict[layer_prefix + hf_name] +# ) + +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.layer_norm_weight", "input_layernorm.weight" +# ) +# copy_from_ht_to_te("self_attention.proj.weight", "self_attn.o_proj.weight") +# copy_from_ht_to_te("layernorm_mlp.layer_norm_weight", "post_attention_layernorm.weight") +# copy_from_ht_to_te("layernorm_mlp.fc2_weight", "mlp.down_proj.weight") +# copy_from_ht_to_te( +# "layernorm_mlp.fc1_weight", "mlp.gate_proj.weight", end=config.intermediate_size +# ) +# copy_from_ht_to_te( +# "layernorm_mlp.fc1_weight", "mlp.up_proj.weight", start=config.intermediate_size +# ) + +# if qkv_fused_and_interleaved: +# """ +# When qkv_fused_and_interleaved=True, key, query and value layers are on one tensor +# in TE TransformerLayer. Moreover they are interleaved within each head. +# Let q_i, k_i and v_i be query, key and value layers for i-th head respectively. +# Then TE stores weight tensor in the form: +# [q1 k1 v1 q2 k2 v2 ...] +# This is done to maximally optimize performance time. +# """ +# te_qkv_layer = te_state_dict[layer_prefix + "self_attention.layernorm_qkv.weight"] + +# def copy_interleave(hf_name, idx): +# src = hf_state_dict[layer_prefix + hf_name] +# for head_nr in range(config.num_attention_heads): +# dst_offset = head_nr * config.head_dim * 3 +# dst_slice = slice( +# dst_offset + idx * config.head_dim, dst_offset + (idx + 1) * config.head_dim +# ) +# src_slice = slice( +# head_nr * config.head_dim, head_nr * config.head_dim + config.head_dim +# ) +# te_qkv_layer[dst_slice, :] = src[src_slice, :] + +# copy_interleave("self_attn.q_proj.weight", 0) +# copy_interleave("self_attn.k_proj.weight", 1) +# copy_interleave("self_attn.v_proj.weight", 2) +# else: +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.query_weight", "self_attn.q_proj.weight" +# ) +# copy_from_ht_to_te("self_attention.layernorm_qkv.key_weight", "self_attn.k_proj.weight") +# copy_from_ht_to_te( +# "self_attention.layernorm_qkv.value_weight", "self_attn.v_proj.weight" +# ) + +# return all_layer_prefixes diff --git a/docs/examples/te_gemma/test_paged_attn.ipynb b/docs/examples/te_gemma/test_paged_attn.ipynb new file mode 100755 index 0000000000..543ebe9262 --- /dev/null +++ b/docs/examples/te_gemma/test_paged_attn.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ace403ac-c276-4378-a4e8-0155165f9934", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb b/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb new file mode 100755 index 0000000000..7875ffc9f3 --- /dev/null +++ b/docs/examples/te_gemma/tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb @@ 
-0,0 +1,314 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Accelerating a Hugging Face Gemma model finetuning with Transformer Engine" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the previous [tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb), we demonstrated how to accelerate HF Llama models using the Transformer Engine library. We replaced `LlamaDecoderLayer` with `TransformerLayer` from the Transformer Engine, achieving a speedup. Furthermore, we conducted the finetuning in FP8 precision, which yielded an additional speedup.\n", + "\n", + "Now, we will undertake a similar enhancement for the Google's [Gemma](https://blog.google/technology/developers/gemma-open-models/) model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial\n", + "\n", + "Following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_gemma.py`\n", + " - This file contains the code to load a Hugging Face Gemma checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `GemmaDecoderLayer`. This is used in the following two sections of the tutorial - \"Improvement 1\" and \"Improvement 2\".\n", + "2. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "3. `requirements.txt`\n", + " - This file contains necessary Python packages for this tutorial.\n", + "4. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -r requirements.txt\n", + "\n", + "import torch\n", + "cudnn_version = torch.backends.cudnn.version()\n", + "assert cudnn_version >= 90100, \"cuDNN version >= 9.1.0 is needed to run this tutorial.\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Differences between Llama and Gemma" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thr Llama and the Gemma are very similar models - both are based on Transformer Decoder architecture. The most important architectural differences between them are the following:\n", + "\n", + "\n", + "| Feature | Llama | Gemma |\n", + "|----------------------------------------------|------------------------------------|--------------------------------------------|\n", + "| **Norm Layer** | Standard RMSNorm
$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\varepsilon}} * \\gamma + \\beta$ | RMSNorm with zero centered gamma parameter
$y = \\frac{x - \\mathrm{E}[x]}{ \\sqrt{\\mathrm{Var}[x] + \\varepsilon}} * (\\textcolor{red}{1 +} \\gamma) + \\beta$ |\n", + "| **Embedding Dimension/Head Dimension** | 4096/4096 | 3072/4096 |\n", + "| **Activation Function** | SwiGlu | GeGlu |\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Baseline] Running HF `GemmaModel` (Precision: `BF16`)\n", + "\n", + "Similarly to the Llama tutorial, we begin the experiments by running baseline Hugging Face Gemma model finetuning in BF16 precision.\n", + "\n", + "
\n", + "\n", + "Note\n", + " \n", + "This tutorial loads and trains a Gemma 7B model which takes up most of the GPU memory and therefore, we need to restart the jupyter notebook each time before running the following sections. A small utility method `restart_jupyter_notebook` is defined in the accompanying `utils.py` file. This function restarts the jupyter notebook so that the GPU memory is flushed before the model is loaded again from the checkpoint in order to avoid running into OOM (Out Of Memory) errors.\n", + "\n", + "If the utility doesn't work, comment this line `restart_jupyter_notebook()` in the following cell and manually restart the jupyter notebook before running the cell. Repeat the same for other sections in this tutorial.\n", + "\n", + "
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "298 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_baseline_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's add this information in a table and keep comparing it with a few possible improvements in future sections:\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Improvement 1] Replace HF's `GemmaDecoderLayer` with TE's `TransformerLayer` (Precision: `BF16`)\n", + "\n", + "We replace *GemmaDecoderLayer* with the highly tuned *TransformerLayer*, similarly to our approach in the [Llama tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb). Let's observe the impact this change has on the model's speed." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "257 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"bf16\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_gemma_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compared to the \"baseline\" implementation, we see that using Transformer Engine's `TransformerLayer` in place of Huggging Face's `GemmaDecoderLayer` gives a speedup of **16%** even when using only BF16 precision!\n", + "\n", + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 257 | 1.16 |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## [Improvement 2] Replace HF's `GemmaDecoderLayer` with TE's `TransformerLayer` (Precision: `FP8`)\n", + "\n", + "The last improvement is about enabling FP8 precision. Let's see how it works." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 finetuning steps complete!\n", + "\n", + "Average time taken per step: \n", + "214 \n", + "milliseconds\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "#restart_jupyter_notebook()\n", + "\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "## !!! `model_name` attr must point to the location of the model weights !!!\n", + "## Weights can be downloaded from: https://huggingface.co/google/gemma-7b\n", + "hyperparams.model_name = \"../../../../gemma-7b\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.mixed_precision = \"fp8\"\n", + "\n", + "\n", + "# Init the model and accelerator wrapper\n", + "model = init_te_gemma_model(hyperparams).cuda()\n", + "accelerator, model, optimizer, train_dataloader, lr_scheduler = wrap_with_accelerator(model, hyperparams)\n", + "\n", + "\n", + "# Finetune the model\n", + "finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "| Models | Precision | Step Time (or ms per batch) | Speedup (over baseline) |\n", + "|-------------------------------------------------------------|-----------|-----------------------------|-------------------------|\n", + "| HF (baseline) | BF16 | 298 | 1 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | BF16 | 257 | 1.16 |\n", + "| TE (replace `GemmaDecoderLayer` with `TE.TransformerLayer`) | FP8 | 214 | 1.39 |\n", + "\n", + "\n", + "After turning on FP8 precision, we get even more speedup of almost **39%**!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusion\n", + "\n", + "As shown in the [Llama tutorial](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb), using the `TransformerLayer` module from Transformer Engine to replace Hugging Face's `GemmaDecoderLayer` results in a speedup compared to Hugging Face's native Gemma implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## See more\n", + "\n", + "We also prepared [tutorial](./tutorial_generation_gemma_with_te.ipynb) in which we will show how to speedup the Gemma model generation using Transformer Engine." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb b/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb new file mode 100755 index 0000000000..acb93b795e --- /dev/null +++ b/docs/examples/te_gemma/tutorial_generation_gemma_with_te.ipynb @@ -0,0 +1,1277 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "40364db7", + "metadata": {}, + "source": [ + "# Accelerating token generation of the Hugging Face Gemma Model with Transformer Engine\n", + "\n", + "Generative AI has made remarkable strides in recent years, with Large Language Models (LLMs) like ChatGPT at the forefront. These models have revolutionized how we interact with machine-generated content, providing capabilities that range from writing assistance to complex decision support. The core functionality of these models is the generation process, which involves predicting the next token in a sequence based on the preceding text. This task is critical for applications such as automated content creation, translation, and more, emphasizing the importance of efficient implementation.\n", + "\n", + "\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Animation 1: Hugging Face Gemma model token generation.\n", + "
\n", + "
\n", + "\n", + "For those seeking a deeper understanding of text generation mechanisms in Transformers, it is recommended to check out the [HuggingFace generation tutorial](https://huggingface.co/docs/transformers/llm_tutorial).\n", + "\n", + "In the previous tutorials on [Llama](../te_llama/tutorial_accelerate_hf_llama_finetuning_with_te.ipynb) and [Gemma](./tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb), it was demonstrated how finetuning can be accelerated using the Transformer Engine's `TransformerLayer`. Building on this foundation, the current objective is to enhance the generation speed of the Gemma model.\n", + "\n", + "This tutorial will introduce and explain several advanced features of the Transformer Engine that contribute to this goal:\n", + "\n", + "###### **1. THD Attention Layout.**\n", + "\n", + "Addressing the challenge of computing attention for sequences with varying lengths, a common method is to pad these sequences and apply an attention mask. The Transformer Engine, however, offers a more optimized approach—by specifying the lengths and offsets of the sequences, attention can be computed directly. Instead of passing the tensor with shape `[b, s, h, d]` and the attention mask, one can pass a tensor of the shape `[t, h, d]` along with tensors detailing cumulative sequence lengths and offsets to run the attention optimized for this case. This specific attention layout is referred to as the **THD layout**. \n", + "\n", + "\n", + "The letter `t` in the standard `[t, h, d]` layout is equal to the total length of the sequences, namely `t = s_1 + s_2 + ... + s_b`, where `s_i` denotes the length of sequence `i`. TransformerEngine supports a THD layout that incorporates gaps between these sequences - the lengths of the offsets need to be passed in the additional parameter.\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 1: The difference between BSHD (default) and THD attention layouts is as follows: with BSHD, one needs to provide the attention mask, while with THD, one needs to provide cumulative sequence lengths and sequence offsets.\n", + "
\n", + "
\n", + "\n", + "###### **2. CUDA Graphs API.**\n", + "\n", + "The speed of GPUs is increasing at a rapid pace. It turns out that sometimes the runtime of kernels is shorter than the time it takes for the CPU to submit them, which can lead to significant overhead. CUDA Graphs can address this issue. When certain kernels are executed repeatedly, it allows us to record and replay them with less CPU involvement. This becomes particularly useful in applications like token generation, where a `TransformerLayer` is run for every token that needs to be generated.\n", + "\n", + "One can read more about CUDA Graphs [here](https://developer.nvidia.com/blog/cuda-graphs/).\n", + "\n", + "PyTorch exposes graphs via a raw `torch.cuda.CUDAGraph` class and two convenience wrappers: `torch.cuda.graph` and `torch.cuda.make_graphed_callables`. More information about the cuda graphs in Pytorch can be found [here](https://pytorch.org/blog/accelerating-pytorch-with-cuda-graphs/).\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 2: CUDA Graphs reduce the overhead generated by the long time it takes to launch a single kernel. It enables the recording and replaying of subsequent launches, thus reducing the total time used by the CPU.\n", + "
\n", + "
\n", + "\n", + "\n", + "###### **3. FP8 Weights Calibration.**\n", + "\n", + "Assuming that the model is trained in FP32/BF16 precision and the goal is to execute it in FP8 precision, the process isn't straightforward due to the absence of appropriate FP8 scaling factors. In this scenario, FP8 calibration becomes essential. By conducting several forward passes on sample data, the FP8 scaling parameters can be computed. This calibration allows the model to operate correctly in FP8 precision.\n", + "\n", + "It is highly recommended to familiarize oneself with the [tutorial](../../examples/fp8_primer.ipynb) on FP8 precision to understand the importance of proper scaling factors.\n", + "\n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 3:\n", + "If the model is trained in BF16/FP32, it does not include the computed FP8 scaling factors. When it is run under fp8_autocast(), the value of these scaling factors will default to their initial values, which can cause numerical errors. Weight calibration involves calculating FP8 scaling factors from higher precision forward passes. Once these factors are computed, the model becomes numerically stable. \n", + "
\n", + "
\n", + "\n", + "###### **4. FP8 Model Weights.**\n", + "\n", + "The typical approach is to store weights in higher precision and then cast them to fp8 before operations. This may prevent accuraccy drops in training. However, for inference, this level of precision is not necessary.\n", + "\n", + "The TransformerEngine includes a wrapper `fp8_model_​init`, which allows for the creation of models that store only the FP8 copy of the weights. This eliminates the need to cast from higher precision to BF16, saving time in this casting process. \n", + "\n", + "
\n", + "\"\"\n", + "
\n", + "Figure 4: Model under fp8_autocast() stores weights in high precision by default, and casts them if needed. It can leads to slowdown and increased memory usage. Using fp8_model_init() results in storing weight in FP8.\n", + "
\n", + "
\n", + "\n", + "###### Benchmarking\n", + "\n", + "We'll evaluate the generation time across one benchmark: generation with context phase max sequence length = 128, batch size = 64 and number of generated tokens = 896 on random texts with random lengths.\n", + "\n", + "
\n", + "Note\n", + " \n", + "This tutorial focuses on showcasing the mentioned features of Transformer Engine in the context of token generation. It's important to note, however, that NVIDIA provides [TensorRT](https://developer.nvidia.com/tensorrt), which is optimized for inference tasks and should be considered for such use cases.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "b18f91a9", + "metadata": {}, + "source": [ + "## Dependencies for this tutorial" + ] + }, + { + "cell_type": "markdown", + "id": "e5201d77", + "metadata": {}, + "source": [ + "Following files and media are necessary to effectively run this tutorial:\n", + "\n", + "1. `te_gemma.py`\n", + " - This file contains the code to load a Hugging Face Gemma checkpoint in Transformer Engine's `TransformerLayer` instead of Hugging Face's `GemmaDecoderLayer`. It does also contain code for generation with THD attention, CUDA Graphs and weight calibration.\n", + "2. `te_gemma_loading_weights.py`\n", + " - This file contains logic of mapping the parameters from `GemmaDecoderLayer` into the `TransformerLayer`.\n", + "3. `utils.py`\n", + " - This file contains the code related to dataloading, hyperparameters, setting up model/optimizers/accelerator, model training and other miscellaneous tasks like restarting the jupyter notebook from within the cell. \n", + "4. `requirements.txt`\n", + " - This file contains necessary Python packages for this tutorial.\n", + "5. `media/`\n", + " - This directory contains the images used in the following tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "31390c76", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com\n", + "Collecting transformers==4.41.1 (from -r requirements.txt (line 1))\n", + " Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)\n", + "Collecting accelerate==0.30.1 (from -r requirements.txt (line 2))\n", + " Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)\n", + "Collecting datasets==2.19.1 (from -r requirements.txt (line 3))\n", + " Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)\n", + "Collecting sentencepiece==0.2.0 (from -r requirements.txt (line 4))\n", + " Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (3.16.1)\n", + "Collecting huggingface-hub<1.0,>=0.23.0 (from transformers==4.41.1->-r requirements.txt (line 1))\n", + " Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (1.24.4)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (6.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (2024.9.11)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (2.32.3)\n", + "Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.1->-r requirements.txt (line 1))\n", + " Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from 
transformers==4.41.1->-r requirements.txt (line 1)) (0.4.5)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers==4.41.1->-r requirements.txt (line 1)) (4.66.5)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate==0.30.1->-r requirements.txt (line 2)) (6.0.0)\n", + "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate==0.30.1->-r requirements.txt (line 2)) (2.5.0a0+e000cf0ad9.nv24.10)\n", + "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (16.1.0)\n", + "Collecting pyarrow-hotfix (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n", + "Collecting dill<0.3.9,>=0.3.0 (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (2.2.2)\n", + "Collecting xxhash (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", + "Collecting multiprocess (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)\n", + "Collecting fsspec<=2024.3.1,>=2023.1.0 (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets==2.19.1->-r requirements.txt (line 3)) (3.10.5)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (2.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (24.2.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (1.9.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets==2.19.1->-r requirements.txt (line 3)) (4.0.3)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.23.0->transformers==4.41.1->-r requirements.txt (line 1)) (4.12.2)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in 
/usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers==4.41.1->-r requirements.txt (line 1)) (2024.8.30)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (3.3)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (3.1.4)\n", + "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (1.13.1)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (1.3.0)\n", + "INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.\n", + "Collecting multiprocess (from datasets==2.19.1->-r requirements.txt (line 3))\n", + " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2023.4)\n", + "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets==2.19.1->-r requirements.txt (line 3)) (2024.1)\n", + "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets==2.19.1->-r requirements.txt (line 3)) (1.16.0)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate==0.30.1->-r requirements.txt (line 2)) (2.1.5)\n", + "Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.1/9.1 MB\u001b[0m \u001b[31m175.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading accelerate-0.30.1-py3-none-any.whl (302 kB)\n", + "Downloading datasets-2.19.1-py3-none-any.whl (542 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m542.0/542.0 kB\u001b[0m \u001b[31m334.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m628.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n", + "Downloading fsspec-2024.3.1-py3-none-any.whl (171 kB)\n", + "Downloading huggingface_hub-0.26.2-py3-none-any.whl (447 kB)\n", + "Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m296.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", + "Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", + "Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", + "Installing collected packages: sentencepiece, xxhash, pyarrow-hotfix, fsspec, dill, multiprocess, huggingface-hub, tokenizers, accelerate, transformers, datasets\n", + " Attempting uninstall: fsspec\n", + " Found existing installation: fsspec 2024.6.1\n", + " Uninstalling fsspec-2024.6.1:\n", + " Successfully uninstalled fsspec-2024.6.1\n", + " Attempting uninstall: dill\n", + " Found existing installation: dill 0.3.9\n", + " Uninstalling dill-0.3.9:\n", + " Successfully uninstalled dill-0.3.9\n", + "Successfully installed accelerate-0.30.1 datasets-2.19.1 dill-0.3.8 fsspec-2024.3.1 huggingface-hub-0.26.2 multiprocess-0.70.16 pyarrow-hotfix-0.6 sentencepiece-0.2.0 tokenizers-0.19.1 transformers-4.41.1 xxhash-3.5.0\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable.It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.\u001b[0m\u001b[33m\n", + "\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython -m pip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -r requirements.txt\n", + "\n", + "import torch\n", + "cudnn_version = torch.backends.cudnn.version()\n", + "assert cudnn_version >= 90100, \"cuDNN version >= 9.1.0 is needed to run this tutorial.\"" + ] + }, + { + "cell_type": "markdown", + "id": "e8dfabbf", + "metadata": {}, + "source": [ + "\n", + "|\n", + "## [Baseline] Running Hugging Face generation with Gemma model" + ] + }, + { + "cell_type": "markdown", + "id": "59560bff", + "metadata": {}, + "source": [ + "HuggingFace Transformers library offers generation API. \n", + "HuggingFace generation for the Gemma model will be used as a baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2803e0ec", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.\n", + "`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.\n", + "Gemma's activation function will be set to `gelu_pytorch_tanh`. 
Please, use\n", + "`config.hidden_activation` if you want to override this behaviour.\n", + "See https://github.com/huggingface/transformers/pull/29402 for more details.\n", + "Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00, 1.02s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops high-performance computer graphics and computer processing units (CPUs) for the gaming and professional markets.\n", + "* The company was founded in 1993 and is headquartered in Santa Clara\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops high-performance computer graphics and computer processing units (CPUs) for the gaming and professional markets.\n", + "* The company was founded in 1993 and is headquartered in Santa Clara\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "The first fact is why GPUs are so good at graphics. The second fact is\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n", + "# !!! `model_name` attr must point to the location of the model weights !!!\n", + "# Weights can be downloaded from: https://huggingface.co/google/gemma-7b.\n", + "# Weights should be in the *.safetensors HF format, not in the original format.\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "\n", + "model = init_baseline_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "b3698dc6", + "metadata": {}, + "source": [ + "Let's put this time into the table for later comparison.\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |" + ] + }, + { + "cell_type": "markdown", + "id": "8bb40f45", + "metadata": {}, + "source": [ + "## [Improvement 1] Using TransformerLayer from Transformer Engine instead of GemmaDecoderLayer." + ] + }, + { + "cell_type": "markdown", + "id": "263b40f2", + "metadata": {}, + "source": [ + "As in the [Gemma](./tutorial_accelerate_hf_gemma_finetuning_with_te.ipynb) finetuning tutorial, a GemmaDecoderLayer is substituted by a tuned TransformerLayer from the Transformer Engine. Let's run it and compare the time with the baseline." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9dceef93", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. 
Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "> \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py\u001b[0m(8223)\u001b[0;36mforward\u001b[0;34m()\u001b[0m\n", + "\u001b[0;32m 8221 \u001b[0;31m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 8222 \u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0mpdb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_trace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m-> 8223 \u001b[0;31m key_layer, value_layer = inference_params.save_to_kv_cache(\n", + "\u001b[0m\u001b[0;32m 8224 \u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlayer_number\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey_layer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue_layer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0m\u001b[0;32m 8225 \u001b[0;31m )\n", + "\u001b[0m\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> key_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([128, 64, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> value_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([128, 64, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> query_layer.shape\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([8192, 16, 256])\n" + ] + }, + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ipdb> c\n" + ] + }, + { + "ename": "AssertionError", + "evalue": "Queries, keys and values must be 4D tensors when qkv_format = bshd!", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[2], line 11\u001b[0m\n\u001b[1;32m 7\u001b[0m hyperparams\u001b[38;5;241m.\u001b[39mmodel_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/tmp/gemma-7b-hf\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;66;03m# <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\u001b[39;00m\n\u001b[1;32m 9\u001b[0m model \u001b[38;5;241m=\u001b[39m init_te_gemma_model(hyperparams)\n\u001b[0;32m---> 11\u001b[0m \u001b[43mprint_sample_of_generated_texts\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[38;5;66;03m# benchmark_generation(model)\u001b[39;00m\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/utils.py:280\u001b[0m, in \u001b[0;36mprint_sample_of_generated_texts\u001b[0;34m(model)\u001b[0m\n\u001b[1;32m 277\u001b[0m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcuda()\n\u001b[1;32m 278\u001b[0m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mcuda()\n\u001b[0;32m--> 280\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmax_new_tokens\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m50\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 281\u001b[0m generated_texts \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mbatch_decode(outputs, skip_special_tokens\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[1;32m 283\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mprint_output\u001b[39m(prompts, generated_texts, idx):\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator..decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:450\u001b[0m, in \u001b[0;36mTEGemmaForCausalLM.generate\u001b[0;34m(self, input_ids, pad_token_id, max_new_tokens, *args, **kwargs)\u001b[0m\n\u001b[1;32m 446\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 447\u001b[0m \u001b[38;5;66;03m# For thd layout padding is at the end, otherwise at the beginning.\u001b[39;00m\n\u001b[1;32m 448\u001b[0m 
TEGemmaForCausalLM\u001b[38;5;241m.\u001b[39m_padding_to_end(input_ids, lengths)\n\u001b[0;32m--> 450\u001b[0m hidden_states, next_tokens \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_generate_context_phase\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 452\u001b[0m \u001b[38;5;66;03m# Generation phase.\u001b[39;00m\n\u001b[1;32m 453\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:381\u001b[0m, in \u001b[0;36mTEGemmaForCausalLM._generate_context_phase\u001b[0;34m(self, input_ids, inference_params)\u001b[0m\n\u001b[1;32m 378\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 379\u001b[0m inference_params\u001b[38;5;241m.\u001b[39msetup_before_new_input(length\u001b[38;5;241m=\u001b[39minput_ids\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m])\n\u001b[0;32m--> 381\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_model_context_phase\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 382\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 383\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m!=\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 384\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpadding_causal\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m==\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mthd\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43marbitrary\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\n\u001b[1;32m 385\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 387\u001b[0m \u001b[38;5;66;03m# We choose logits coresponding with last token in each sequence,\u001b[39;00m\n\u001b[1;32m 388\u001b[0m \u001b[38;5;66;03m# which have various lengths - they are stored in 
(inference_params.incoming_seq_len - 1)\u001b[39;00m\n\u001b[1;32m 389\u001b[0m \u001b[38;5;66;03m# Tensor when qkv_format == \"thd\" and\u001b[39;00m\n\u001b[1;32m 390\u001b[0m \u001b[38;5;66;03m# they are the last token in the sequence when qkv_format != \"thd\".\u001b[39;00m\n\u001b[1;32m 391\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:183\u001b[0m, in \u001b[0;36mStaticGemmaModel.forward\u001b[0;34m(self, hidden_states, attention_mask, attn_mask_type)\u001b[0m\n\u001b[1;32m 180\u001b[0m 
hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m=\u001b[39m hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m*\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnormalizer\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i, decoder_layer \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mlayers):\n\u001b[0;32m--> 183\u001b[0m hidden_states\u001b[38;5;241m.\u001b[39mdata[:] \u001b[38;5;241m=\u001b[39m \u001b[43mdecoder_layer\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 184\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 185\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 186\u001b[0m \u001b[43m \u001b[49m\u001b[43mself_attn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmask\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mis\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 187\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 188\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m[\n\u001b[1;32m 189\u001b[0m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 190\u001b[0m ] \u001b[38;5;66;03m# static copy - for CUDA graphs\u001b[39;00m\n\u001b[1;32m 192\u001b[0m hidden_states\u001b[38;5;241m.\u001b[39mcopy_(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmodel\u001b[38;5;241m.\u001b[39mnorm(hidden_states)) \u001b[38;5;66;03m# static copy - for CUDA graphs\u001b[39;00m\n\u001b[1;32m 193\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(hidden_states)\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to 
skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/docs/examples/te_gemma/te_gemma.py:151\u001b[0m, in \u001b[0;36mTEGemmaDecoderLayer.forward\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 148\u001b[0m kwargs\u001b[38;5;241m.\u001b[39mpop(key, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 150\u001b[0m \u001b[38;5;66;03m# We need to return tuple to be compatible with HF.\u001b[39;00m\n\u001b[0;32m--> 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mte_rope_emb\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m,)\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/transformer.py:690\u001b[0m, in \u001b[0;36mTransformerLayer.forward\u001b[0;34m(self, hidden_states, attention_mask, self_attn_mask_type, window_size, encoder_output, enc_dec_attn_mask, enc_dec_attn_mask_type, enc_dec_window_size, is_first_microbatch, checkpoint_core_attention, inference_params, rotary_pos_emb, core_attention_bias_type, core_attention_bias, alibi_slopes, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, fast_zero_fill)\u001b[0m\n\u001b[1;32m 687\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m cast_if_needed(hidden_states, torch\u001b[38;5;241m.\u001b[39mget_autocast_gpu_dtype())\n\u001b[1;32m 689\u001b[0m \u001b[38;5;66;03m# Self attention.\u001b[39;00m\n\u001b[0;32m--> 690\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself_attention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 691\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 692\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 693\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mself_attn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 694\u001b[0m \u001b[43m \u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwindow_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 695\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 696\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_first_microbatch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_first_microbatch\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 697\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 698\u001b[0m \u001b[43m \u001b[49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrotary_pos_emb\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 699\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 700\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 701\u001b[0m \u001b[43m \u001b[49m\u001b[43malibi_slopes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malibi_slopes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 702\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 703\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 704\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 705\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 706\u001b[0m \u001b[43m \u001b[49m\u001b[43mfast_zero_fill\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfast_zero_fill\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 707\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 709\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_residual_connection_post_layernorm \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput_layernorm:\n\u001b[1;32m 710\u001b[0m attention_output, attention_bias, residual \u001b[38;5;241m=\u001b[39m self_attention_outputs\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py:9453\u001b[0m, in \u001b[0;36mMultiheadAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, encoder_output, attn_mask_type, window_size, is_first_microbatch, checkpoint_core_attention, inference_params, rotary_pos_emb, core_attention_bias_type, core_attention_bias, alibi_slopes, cu_seqlens_q, cu_seqlens_kv, max_seqlen_q, max_seqlen_kv, fast_zero_fill)\u001b[0m\n\u001b[1;32m 9447\u001b[0m query_layer \u001b[38;5;241m=\u001b[39m query_layer\u001b[38;5;241m.\u001b[39mview(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m*\u001b[39mquery_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m2\u001b[39m:])\u001b[38;5;241m.\u001b[39mcontiguous()\n\u001b[1;32m 9449\u001b[0m \u001b[38;5;66;03m# ===========================\u001b[39;00m\n\u001b[1;32m 9450\u001b[0m \u001b[38;5;66;03m# Core attention computation\u001b[39;00m\n\u001b[1;32m 9451\u001b[0m \u001b[38;5;66;03m# ===========================\u001b[39;00m\n\u001b[0;32m-> 9453\u001b[0m context_layer \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcore_attention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 9454\u001b[0m \u001b[43m \u001b[49m\u001b[43mquery_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9455\u001b[0m \u001b[43m \u001b[49m\u001b[43mkey_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9456\u001b[0m \u001b[43m \u001b[49m\u001b[43mvalue_layer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9457\u001b[0m \u001b[43m \u001b[49m\u001b[43mqkv_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mqkv_format\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9458\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9459\u001b[0m \u001b[43m \u001b[49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcu_seqlens_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9460\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_q\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9461\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmax_seqlen_kv\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9462\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9463\u001b[0m \u001b[43m \u001b[49m\u001b[43mattn_mask_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattn_mask_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9464\u001b[0m \u001b[43m \u001b[49m\u001b[43mwindow_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mwindow_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9465\u001b[0m \u001b[43m \u001b[49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcheckpoint_core_attention\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9466\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9467\u001b[0m \u001b[43m \u001b[49m\u001b[43mcore_attention_bias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcore_attention_bias\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9468\u001b[0m \u001b[43m \u001b[49m\u001b[43malibi_slopes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43malibi_slopes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9469\u001b[0m \u001b[43m \u001b[49m\u001b[43mfast_zero_fill\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfast_zero_fill\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9470\u001b[0m \u001b[43m \u001b[49m\u001b[43minference_params\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minference_params\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9471\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 9473\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mqkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 9474\u001b[0m \u001b[38;5;66;03m# [b * sq, h] -> [qs, b, h]\u001b[39;00m\n\u001b[1;32m 9475\u001b[0m context_layer \u001b[38;5;241m=\u001b[39m context_layer\u001b[38;5;241m.\u001b[39mview(\n\u001b[1;32m 9476\u001b[0m (inference_params\u001b[38;5;241m.\u001b[39mmax_batch_size, 
\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m, context_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 9477\u001b[0m )\u001b[38;5;241m.\u001b[39mcontiguous()\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1736\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1734\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1735\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1736\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py:1747\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1742\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1743\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1744\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1745\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1746\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1747\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1749\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1750\u001b[0m called_always_called_hooks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mset\u001b[39m()\n", + "File \u001b[0;32m/perfhome/mnt/wkstn/work/repos/TransformerEngine/transformer_engine/pytorch/attention.py:8301\u001b[0m, in \u001b[0;36mDotProductAttention.forward\u001b[0;34m(self, query_layer, key_layer, value_layer, attention_mask, qkv_format, cu_seqlens_q, cu_seqlens_kv, cu_seqlens_q_padded, cu_seqlens_kv_padded, max_seqlen_q, max_seqlen_kv, attn_mask_type, window_size, checkpoint_core_attention, core_attention_bias_type, core_attention_bias, alibi_slopes, fast_zero_fill, inference_params, is_first_microbatch)\u001b[0m\n\u001b[1;32m 8298\u001b[0m context_parallel \u001b[38;5;241m=\u001b[39m cp_size \u001b[38;5;241m>\u001b[39m 
\u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 8300\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m qkv_format \u001b[38;5;129;01min\u001b[39;00m [\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msbhd\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbshd\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[0;32m-> 8301\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mall\u001b[39m(\n\u001b[1;32m 8302\u001b[0m \u001b[38;5;28mlen\u001b[39m(x\u001b[38;5;241m.\u001b[39mshape) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m4\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m (query_layer, key_layer, value_layer)\n\u001b[1;32m 8303\u001b[0m ), \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mQueries, keys and values must be 4D tensors when qkv_format = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mqkv_format\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m!\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 8304\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m qkv_format \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msbhd\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 8305\u001b[0m max_seqlen_q \u001b[38;5;241m=\u001b[39m query_layer\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;28;01mif\u001b[39;00m max_seqlen_q \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m max_seqlen_q\n", + "\u001b[0;31mAssertionError\u001b[0m: Queries, keys and values must be 4D tensors when qkv_format = bshd!" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "b5d40836", + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "source": [ + "The speedup of **62%** was obtained." + ] + }, + { + "cell_type": "markdown", + "id": "006d18e8", + "metadata": {}, + "source": [ + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | " + ] + }, + { + "cell_type": "markdown", + "id": "2bbf3d47", + "metadata": {}, + "source": [ + "## [Improvement 2] Use of THD attention layout.\n", + "\n", + "Input sequences can have various lengths. Hugging Face generation – as can be seen in Animation 1 – pads the sequences and then uses attention mask. In the THD attention layout cumulative sequence lengths and offsets need to be provided, instead of attention mask. The THD attention layout is much more optimized than BSHD layout.\n", + "\n", + "The class `transformer_engine.pytorch.DotProductAttention` supports this format. 
One needs to pass the following arguments to the forward method (a sketch of how this metadata can be built is shown after the note below):\n", + "- `seq_offsets_q`, `seq_offsets_k`, `seq_offsets_v` – offsets marking where each sequence begins in the query, key and value tensors,\n", + "- `cu_seqlens_q`, `cu_seqlens_kv` – cumulative sums of the sequence lengths for the query and key-value layers,\n", + "- `max_seqlen_q` – maximum sequence length in the query layer,\n", + "- `max_seqlen_kv` – maximum sequence length in the key-value layer.\n", + "\n", + "<div class=\"alert alert-info\">
\n", + "Note\n", + "\n", + "Currently, THD attention for `TransformerLayer` is supported only for token generation.\n", + "</div>
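\n", + "\n", + "Below is a rough sketch of how this metadata can be built for a batch of variable-length sequences. The snippet is illustrative only, it is not part of the tutorial code, and the lengths are made up:\n", + "```\n", + "import torch\n", + "\n", + "# Hypothetical lengths of three sequences in a batch\n", + "seq_lens = torch.tensor([5, 3, 7], dtype=torch.int32, device=\"cuda\")\n", + "\n", + "# Cumulative sequence lengths: tensor([0, 5, 8, 15])\n", + "cu_seqlens = torch.zeros(seq_lens.numel() + 1, dtype=torch.int32, device=\"cuda\")\n", + "cu_seqlens[1:] = torch.cumsum(seq_lens, dim=0)\n", + "\n", + "# Longest sequence in the batch\n", + "max_seqlen = int(seq_lens.max())\n", + "```\n", + "Tensors like these are what `DotProductAttention.forward` consumes as `cu_seqlens_q`, `cu_seqlens_kv`, `max_seqlen_q` and `max_seqlen_kv` when `qkv_format=\"thd\"` is used, instead of a padded attention mask.\n", + "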
\n", + "\n", + "Let's look how using TransformerEngine with THD attention impacts the speed of token generation:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4fc5e1cd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. 
They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n" + ] + } + ], + "source": [ + "# Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "8e397a65", + "metadata": {}, + "source": [ + "By using THD attention, the following speedup was obtained:\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | " + ] + }, + { + "cell_type": "markdown", + "id": "21a89d9c", + "metadata": {}, + "source": [ + "## [Improvement 3] Speeding up generation with CUDA Graphs" + ] + }, + { + "cell_type": "markdown", + "id": "e2d53e7b", + "metadata": {}, + "source": [ + "TransformerEngine includes a function `transformer_engine.pytorch.make_graphed_callables`, which functions similarly to the corresponding feature in PyTorch. It is capable of recording any modules from the Transformer Engine. Below is a code excerpt from `te_gemma.py` from class `TEGemmaForCausalLMCudaGraphs`:\n", + "```\n", + " def __init__(self, config : GemmaConfig):\n", + " (...)\n", + " \n", + " # Here \"the trick\" happens. We override methods from TEGemmaForCausalLM\n", + " # with their recorded version. After invocation of each of them,\n", + " # captured graph will be replayed with minimal usage of CPU,\n", + " # what will lead to huge speedup.\n", + " (...)\n", + " self._model_context_phase = \n", + " self.record_graph(self._model_context_phase, self.hidden_states_buffer) # CUDA Graphs recording\n", + "\n", + " (...) \n", + " self._model_generation_phase = \n", + " self.record_graph(self._model_generation_phase, self.generation_buffer) # CUDA Graphs recording\n", + "\n", + " @torch.no_grad()\n", + " def record_graph(self, function, input_tensor):\n", + " (...)\n", + " # function is invoked on argument (self.hidden_states,) and all kernels are recorded.\n", + " # record_graph() returns captured function, which can be run later with minimal use of th CPU.\n", + " fp8_format = Format.HYBRID\n", + " fp8_recipe = DelayedScaling(fp8_format=fp8_format, amax_history_len=32, amax_compute_algo=\"max\")\n", + " with autocast(dtype=torch.bfloat16, cache_enabled=False):\n", + " graphed_function = te.pytorch.make_graphed_callables(\n", + " function, \n", + " (input_tensor,), \n", + " fp8_enabled=True, \n", + " fp8_recipe=fp8_recipe, \n", + " allow_unused_input=True,\n", + " num_warmup_iters=3\n", + " )\n", + " return graphed_function\n", + "```\n", + "\n", + "It is strongly reccomended to review the entire code of the class `TEGemmaForCausalLMCudaGraphs`. Let's now proceed to evaluate the performance improvement offered by CUDA Graphs." 
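, + "\n", + "To make the record-and-replay idea more concrete, here is a generic PyTorch-level sketch of what happens under the hood. It uses plain `torch.cuda.CUDAGraph` on a toy module rather than the tutorial's `record_graph` helper, and the module and sizes are made up:\n", + "```\n", + "import torch\n", + "\n", + "model = torch.nn.Linear(1024, 1024).cuda().eval()\n", + "static_input = torch.randn(64, 1024, device=\"cuda\")\n", + "\n", + "# Warm up on a side stream, as recommended before graph capture.\n", + "s = torch.cuda.Stream()\n", + "s.wait_stream(torch.cuda.current_stream())\n", + "with torch.cuda.stream(s), torch.no_grad():\n", + "    for _ in range(3):\n", + "        static_output = model(static_input)\n", + "torch.cuda.current_stream().wait_stream(s)\n", + "\n", + "# Capture the kernels launched by a single forward pass.\n", + "graph = torch.cuda.CUDAGraph()\n", + "with torch.cuda.graph(graph), torch.no_grad():\n", + "    static_output = model(static_input)\n", + "\n", + "# Replay: copy new data into the static input buffer and re-launch\n", + "# the whole captured kernel sequence with a single CPU call.\n", + "def run(new_input):\n", + "    static_input.copy_(new_input)\n", + "    graph.replay()\n", + "    return static_output\n", + "```\n", + "`make_graphed_callables` wraps this pattern, including the handling of static inputs and outputs, which is why the tutorial preallocates static buffers with a fixed batch size and maximum sequence length."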
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "31a3a8a3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 by Jensen Huang, Chris Malachowsky, and Curtis Priem.\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. They are very good at doing the same thing over and over again.\n", + "2. 
They are very bad at doing different things at the same time.\n", + "\n", + "This is why they are so good at rendering 3D graphics.\n", + "\n", + "The GPU\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "\n", + "# It is necessary to preallocate a static buffer.\n", + "# CUDA graphs require static input tensors for every kernel.\n", + "# This approach may result in a slight increase in memory consumption;\n", + "# however, the substantial speedup achieved makes it worthwhile.\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "53bb430f", + "metadata": {}, + "source": [ + "The **5.23x** speedup was obtained.\n", + "\n", + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | \n", + "| TE + THD attention + CUDA Graphs | 16.75 | 5.23 | \n" + ] + }, + { + "cell_type": "markdown", + "id": "0a11b75c", + "metadata": {}, + "source": [ + "Let's look at the screenshots from *NVIDIA Nsight System* profiler to see where this speedup comes from:\n", + "\n", + "
\n", + "\n", + "
\n", + "Figure 5: Without CUDA Graphs. One can see that the GPU (blue) is idle for a large portion of the time.\n", + "</b>
\n", + "
\n", + "\n", + "
\n", + "\n", + "
\n", + "Figure 6: With CUDA Graphs. One can see that the GPU (orange) is fully utilized.\n", + "</b>
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "e6b171a0", + "metadata": {}, + "source": [ + "## [Improvement 4] Running generation in FP8 of the model trained in higher precision " + ] + }, + { + "cell_type": "markdown", + "id": "1a80288b", + "metadata": {}, + "source": [ + "Implementing FP8 generation with the Gemma model is not straightforward, because this model was initially trained using BF16 precision, and the necessary FP8 scaling factors are missing. Running the model at this lower precision without proper scaling could lead to significant errors and incorrect results.\n", + "\n", + "It is highly recommended to familiarize oneself with the [tutorial](../../examples/fp8_primer.ipynb) on FP8 precision to understand the necessity of scaling.\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 7: Incorrect FP8 scaling factors lead to numerical errors. Weight calibration allows us to compute the FP8 metadata during forward passes run in higher precision.\n", + "</b>
\n", + "
\n", + "\n", + "### Weight Calibration\n", + "\n", + "To address the issue outlined above, weight calibration will be used. This involves running several forward iterations at BF16 precision within the `te.fp8_autocast(enabled=False, calibrating=True)` context. This setup allows the forward pass to operate at higher precision, while simultaneously collecting `amax_history` and other parameters related to FP8 precision, which are essential for computing the FP8 scaling factors correctly.\n", + "\n", + "The code below outlines the steps to initialize the BF16 model and conduct several forward iterations within the specified context. After these iterations, the model is saved, and these weights are used in the subsequent sections." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "aecee0e1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLM is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Repo card metadata block was not found. Setting CardData to empty.\n", + "[WARNING | huggingface_hub.repocard]: Repo card metadata block was not found. Setting CardData to empty.\n", + "Repo card metadata block was not found. Setting CardData to empty.\n", + "[WARNING | huggingface_hub.repocard]: Repo card metadata block was not found. Setting CardData to empty.\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "import transformer_engine.pytorch as te\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. 
\"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.fuse_qkv_params = True # This is needed by the last improvement.\n", + "\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "# Calibration: run forward passes in BF16 while collecting FP8 statistics.\n", + "with te.fp8_autocast(enabled=False, calibrating=True), \\\n", + "     torch.autocast(device_type='cuda', dtype=torch.bfloat16):\n", + "    model.train()\n", + "    run_forward_pass(model, hyperparams, num_iters=512)\n", + "\n", + "# Compute scale_fwd with fp8 autocast enabled\n", + "with te.fp8_autocast(enabled=True), \\\n", + "     torch.autocast(device_type='cuda', dtype=torch.bfloat16):\n", + "    run_forward_pass(model, hyperparams, 1)\n", + "\n", + "# Some parameters point to the same tensors; saving them twice is avoided here.\n", + "dict_to_save = {k: v for k, v in model.state_dict().items() \\\n", + "                if (\"_context_phase\" not in k and \"_generation_phase\" not in k)}\n", + "torch.save(dict_to_save, 'calibrated_weights.pth') # <== Add path to save calibrated weights." + ] + }, + { + "cell_type": "markdown", + "id": "b6dcd135", + "metadata": {}, + "source": [ + "### Generation in FP8\n", + "\n", + "<div style=\"text-align: center;\">
\n", + "\n", + "
\n", + " Figure 8: After weight calibration, the FP8 scaling factors are correct and prevent numerical errors.\n", + "</b>
\n", + "
\n", + "\n", + "Now FP8 inference is ready to be run." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "38e005f5", + "metadata": {}, + "outputs": [], + "source": [ + "!ls -alh /perfhome/repos/data/gemma-7b-hf/" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a913f54d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. 
GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.qkv_format = \"thd\"\n", + "hyperparams.fuse_qkv_params = True # This is needed by the last improvement.\n", + "\n", + "hyperparams.fp8 = True\n", + "# Calibrated fp8 weights are loaded directly from the file.\n", + "\n", + "hyperparams.fp8_model_weights_filename = \"calibrated_weights.pth\" # <== Add calibrated weights location here.\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "8cdbb56c", + "metadata": {}, + "source": [ + "One can observe that the outputs are coherent; however, the generation time has increased. Why is this the case?\n", + "\n", + "\n", + "
\n", + "\n", + "
\n", + " Figure 9: Running the model at higher precision involves only one GEMM operation. However, when the model operates in FP8, it requires not just the low-precision GEMM but also weight casting.\n", + "
\n", + "
\n", + "\n", + "Running the model in FP8 does not imply that all weights are stored in FP8. By default, they are stored in higher precision and are cast to FP8, using the saved scaling factors, before operations such as GEMMs.\n", + "\n", + "This approach is beneficial during training: a single cast serves both the forward and the backward pass, which leads to speedups. During token generation, however, the weights have to be cast anew for every forward pass, and this overhead outweighs the gains from the low-precision GEMMs. This issue will be addressed in the next section of the tutorial." + ] + }, + { + "cell_type": "markdown", + "id": "8d3945e3", + "metadata": {}, + "source": [ + "### Use of only FP8 model weights" + ] + }, + { + "cell_type": "markdown", + "id": "2dd0cba9", + "metadata": {}, + "source": [ + "Transformer Engine stores parameters in higher precision and only casts them to FP8. This may be necessary to maintain accuracy during training. However, high precision is not needed when doing inference.\n", + "\n", + "Transformer Engine supports keeping only FP8 weights with the `fp8_model_init` context manager. Let's look at an example:\n", + "```\n", + "linear = te.Linear(1024, 1024) # this module is initialized with full precision weights\n", + "with te.fp8_model_init(enabled=True):\n", + "    linear_fp8 = te.Linear(1024, 1024) # this module is initialized only with fp8 weights\n", + "\n", + "assert type(linear.weight.data) is torch.Tensor\n", + "assert type(linear_fp8.weight.data) is te.float8_tensor.Float8Tensor\n", + "```\n", + "\n", + "<div style=\"text-align: center;\">
\n", + "\n", + "
\n", + " Figure 10: Using fp8_model_init stores the weights directly in FP8 format, which reduces both time and memory usage.\n", + "
\n", + "
\n", + "\n", + "Let's run the code with `fp8_model_init`:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96264b9c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in TEGemmaForCausalLMCudaGraphs is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n", + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but the current dype in GemmaModel is torch.float32. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator, or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained(\"openai/whisper-tiny\", attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)`\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "============================== Generation example 1 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 2 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 3 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n", + "============================== Generation example 4 ==============================\n", + "Prompt:\n", + "Some facts about NVIDIA:\n", + "Generated text:\n", + "\n", + "\n", + "* NVIDIA is a global technology company that designs and develops graphics processing units (GPUs) for the gaming and professional markets.\n", + "* NVIDIA was founded in 1993 and is headquartered in Santa Clara, California.\n", + "* NVIDIA's\n", + "============================== Generation example 5 ==============================\n", + "Prompt:\n", + "Here are the two facts about GPUs:\n", + "Generated text:\n", + "\n", + "\n", + "1. GPUs are very good at doing the same thing over and over again.\n", + "2. 
GPUs are very bad at doing different things at the same time.\n", + "\n", + "This is a very important distinction to make.\n", + "\n", + "The first fact is a good thing\n" + ] + } + ], + "source": [ + "#Restart the notebook (to flush the GPU memory)\n", + "from utils import restart_jupyter_notebook\n", + "restart_jupyter_notebook()\n", + "\n", + "# Import necessary packages and methods\n", + "from utils import *\n", + "\n", + "hyperparams.model_name = \"/tmp/gemma-7b-hf/\" # <== Add model weight location here e.g. \"/path/to/downloaded/gemma/weights\"\n", + "hyperparams.fuse_qkv_params = True # Needed for fp8_model_init().\n", + "hyperparams.qkv_format = \"thd\"\n", + "\n", + "hyperparams.fp8 = True\n", + "hyperparams.fp8_model_init = True # This will result in storing only fp8 weights.\n", + "hyperparams.fp8_model_weights_filename = \"calibrated_weights.pth\" # <== Add calibrated weights location here.\n", + "\n", + "hyperparams.generation_cuda_graphs = True\n", + "hyperparams.cuda_graphs_static_batch_size = 64\n", + "hyperparams.cuda_graphs_static_max_seq_len = 1024\n", + "hyperparams.cuda_graphs_static_max_context_len = 128\n", + "model = init_te_gemma_model(hyperparams)\n", + "\n", + "print_sample_of_generated_texts(model)\n", + "# benchmark_generation(model)" + ] + }, + { + "cell_type": "markdown", + "id": "3e30ca5a", + "metadata": {}, + "source": [ + "| Models | Time (s) | Speedup | \n", + "|-------------------------------------------------------------|---------------------------------------|--------------------------------------|\n", + "| HF (baseline) | 87.68 | 1 |\n", + "| TE (subsitution of GemmaDecoderLayer with te.TransformerLayer) | 54.11 | 1.62 | \n", + "| TE + THD attention | 28.22 | 3.11 | \n", + "| TE + THD attention + CUDA Graphs | 16.75 | 5.23 | \n", + "| TE + THD attention + FP8 | 12.13 | 7.23 | \n", + "\n", + "The final speedup is **7.23x**." + ] + }, + { + "cell_type": "markdown", + "id": "c6e87275", + "metadata": {}, + "source": [ + "## Conclusions" + ] + }, + { + "cell_type": "markdown", + "id": "7bb2452d", + "metadata": {}, + "source": [ + "\n", + "
\n", + "\n", + "
\n", + " Figure 11: Generation times in seconds obtained with successive Transformer Engine optimizations.\n", + "</b>
\n", + "
\n", + "\n", + "In this tutorial, we've explored four features of the Transformer Engine:\n", + "1. Support for the THD attention layout,\n", + "2. Integration with CUDA Graphs,\n", + "3. FP8 weight calibration,\n", + "4. Models containing only the FP8 version of their parameters.\n", + "\n", + "Each of these features can be applied in various contexts, such as fast token generation. It's important to note that the fastest possible inference speeds can be achieved using NVIDIA's inference-optimized [TensorRT](https://developer.nvidia.com/tensorrt) library." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/examples/te_gemma/utils.py b/docs/examples/te_gemma/utils.py new file mode 100755 index 0000000000..27e07ee15a --- /dev/null +++ b/docs/examples/te_gemma/utils.py @@ -0,0 +1,367 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +import time +import sys +import IPython +import random +import string + +from te_gemma_loading_weights import load_te_model +from te_llama_loading_weights import load_te_model as load_te_model_llama +import torch +from torch.optim import AdamW +from torch.utils.data import DataLoader + +from transformers import ( +    AutoModelForCausalLM, +    AutoTokenizer, +    get_linear_schedule_with_warmup, +    AutoConfig, +) +from transformers import DataCollatorForLanguageModeling +from datasets import load_dataset +from accelerate import Accelerator +from accelerate.utils.dataclasses import FP8RecipeKwargs + + +from te_gemma import TEGemmaForCausalLM, TEGemmaForCausalLMCudaGraphs +from te_llama import TELlamaForCausalLM, TELlamaForCausalLMCudaGraphs + + +class HyperParameters: +    def __init__(self): +        self.mixed_precision = "bf16" +        self.model_name = None + +        self.fp8 = False + +        # Weights in fp8 +        self.fp8_model_weights_filename = None +        self.fp8_model_init = False + +        # Cuda graphs +        self.generation_cuda_graphs = False +        self.cuda_graphs_static_batch_size = 16 +        self.cuda_graphs_static_max_seq_len = 256 +        self.cuda_graphs_static_max_context_len = 16 + +        # Finetuning settings. +        self.dataset_name = "timdettmers/openassistant-guanaco" +        self.dataset_text_field = "text" +        self.learning_rate = 1.41e-5 +        self.batch_size = 8 +        self.max_seq_length = 256 +        self.gradient_accumulation_steps = 1 +        self.num_warmup_steps = 5 +        self.num_training_steps = 10 + +        # QKV format. +        self.fuse_qkv_params = False +        self.qkv_format = "bshd" + + +hyperparams = HyperParameters() + +assert ( +    torch.backends.cudnn.version() >= 90100 +), "cuDNN version >= 9.1.0 is needed to run this tutorial."
+ + +def get_dataloaders(accelerator: Accelerator, hyperparams): + dataset = load_dataset(hyperparams.dataset_name, split="train") + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + + def tokenize(element): + outputs = tokenizer( + element["text"], + truncation=True, + padding=False, + max_length=hyperparams.max_seq_length, + return_overflowing_tokens=False, + return_length=False, + ) + return {"input_ids": outputs["input_ids"], "attention_mask": outputs["attention_mask"]} + + with accelerator.main_process_first(): + dataset = dataset.map(tokenize, batched=True, remove_columns=dataset.column_names) + + # Simply pad to the multiple of 16 for both FP8 and BF16 precision + pad_to_multiple_of = 16 + data_collator = DataCollatorForLanguageModeling( + tokenizer=tokenizer, + mlm=False, + pad_to_multiple_of=pad_to_multiple_of, + ) + + dataloader_params = { + "batch_size": hyperparams.batch_size, + "collate_fn": data_collator, + "drop_last": True, + } + train_dataloader = DataLoader(dataset, **dataloader_params) + return train_dataloader + + +def init_baseline_model(hyperparams): + # Init the model + config = AutoConfig.from_pretrained(hyperparams.model_name) + # make sure to use flash_attention to do iso comparison with TEGemmaModel + config._attn_implementation = "flash_attention_2" + model = AutoModelForCausalLM.from_pretrained( + hyperparams.model_name, + config=config, + torch_dtype=torch.bfloat16, + ) + return model.cuda() + + +def init_te_llama_model(hyperparams): + cls = TELlamaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TELlamaForCausalLM + config = AutoConfig.from_pretrained(hyperparams.model_name) + config._attn_implementation = "flash_attention_2" + # config.hidden_size = 1024 + # config.head_dim = 128 + print(config) + # Adding all params from the hyperparams to the config to make the code simpler. + for key, value in hyperparams.__dict__.items(): + setattr(config, key, value) + model = load_te_model_llama(cls, config) + if hyperparams.generation_cuda_graphs: + model.record() + return model.cuda() + + +def init_te_gemma_model(hyperparams): + cls = TEGemmaForCausalLMCudaGraphs if hyperparams.generation_cuda_graphs else TEGemmaForCausalLM + config = AutoConfig.from_pretrained(hyperparams.model_name) + config._attn_implementation = "flash_attention_2" + # config.hidden_size = 1024 + # config.head_dim = 128 + print(config) + # Adding all params from the hyperparams to the config to make the code simpler. 
+ for key, value in hyperparams.__dict__.items(): + setattr(config, key, value) + model = load_te_model(cls, config) + if hyperparams.generation_cuda_graphs: + model.record() + return model.cuda() + + +def wrap_with_accelerator(model, hyperparams): + # Create FP8 kwarg handler if required + fp8_kwarg_handler = ( + [FP8RecipeKwargs(backend="te")] if hyperparams.mixed_precision == "fp8" else None + ) + + # Init HF accelerator that's used for training + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision=hyperparams.mixed_precision, + kwargs_handlers=fp8_kwarg_handler, + ) + # accelerator.print(f'State: {accelerator.state}') + train_dataloader = get_dataloaders(accelerator, hyperparams) + + # Wrap model, optimizer/scheduler, dataloaders in accelerate + optimizer = AdamW(params=model.parameters(), lr=hyperparams.learning_rate, fused=True) + lr_scheduler = get_linear_schedule_with_warmup( + optimizer=optimizer, + num_warmup_steps=100, + num_training_steps=hyperparams.num_training_steps, + ) + model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + return accelerator, model, optimizer, train_dataloader, lr_scheduler + + +def finetune_model(model, hyperparams, accelerator, train_dataloader, optimizer, lr_scheduler): + model.train() + optimizer.zero_grad() + train_dataloader = enumerate(train_dataloader) + + def run_iters(num_iters): + for _ in range(num_iters): + _, batch = next(train_dataloader) + with accelerator.accumulate(model): + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + run_iters(hyperparams.num_warmup_steps) # Warmup iters + + # Get the timers ready + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + + start.record() + run_iters(hyperparams.num_training_steps) # Training iters + torch.cuda.synchronize() + end.record() + accelerator.end_training() + + print( + f"""{hyperparams.num_training_steps} finetuning steps complete!\n + Average time taken per step: + {(start.elapsed_time(end)/hyperparams.num_training_steps):.0f} + milliseconds""" + ) + + +def restart_jupyter_notebook(): + # Try restarting the Jupyter kernel + IPython.Application.instance().kernel.do_shutdown(True) + + # Check whether the device memory has been flushed + if torch.cuda.memory_allocated() != 0: + import warnings + + warnings.warn("The device memory hasn't been flushed, trying with a second method!") + + # Try restarting the Jupyter kernel another way + # Restart the kernel + from IPython.core.display import HTML + + HTML("") + + if torch.cuda.memory_allocated() != 0: + print( + "The device memory hasn't been flushed, try manually restarting the Jupyter kernel!" + ) + + # Suppress the warnings + if not sys.warnoptions: + import warnings + + warnings.simplefilter("ignore") + torch.set_warn_always(False) + + +@torch.no_grad() +def run_forward_pass(model, hyperparams, num_iters): + """ + It runs num_iters forward passes with sample data. + """ + accelerator = Accelerator( + log_with="wandb", + gradient_accumulation_steps=hyperparams.gradient_accumulation_steps, + mixed_precision="no", + ) + train_dataloader = get_dataloaders(accelerator, hyperparams) + + # @sudhakars: what's the point of calling `model.train` inside `no_grad` + # context? 
+ model.train() + train_dataloader = enumerate(train_dataloader) + + for _ in range(num_iters): + _, batch = next(train_dataloader) + batch["input_ids"] = batch["input_ids"].cuda() + batch["attention_mask"] = batch["attention_mask"].cuda() + model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]) + + +""" + Benchmarking and example generation functions. +""" + + +def print_sample_of_generated_texts(model): + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + if getattr(tokenizer, "pad_token", None) is None: + tokenizer.pad_token = tokenizer.eos_token + prompts = ["Here are the two facts about GPUs:", "Some facts about NVIDIA:"] + prompts *= 32 + inputs = tokenizer(prompts, return_tensors="pt", padding=True) + + max_length = inputs["input_ids"].size(1) + new_length = ((max_length + 63) // 64) * 128 + + # Add padding to the left + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], (new_length - max_length, 0), value=tokenizer.pad_token_id + ) + + # Add padding to the left (only intended for baseline generation with HF + # which expects padding to the left) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (new_length - max_length, 0), value=0 + ) + + inputs["input_ids"] = inputs["input_ids"].cuda() + inputs["attention_mask"] = inputs["attention_mask"].cuda() + + outputs = model.generate(**inputs, max_new_tokens=50) + generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + def print_output(prompts, generated_texts, idx): + print("=" * 30 + f" Generation example {idx+1} " + "=" * 30) + print("Prompt:") + print(generated_texts[idx][: len(prompts[idx])]) + print("Generated text:") + print(generated_texts[idx][len(prompts[idx]) :]) + + for i in range(5): + print_output(prompts, generated_texts, i) + + +def _generate_random_words(num_words, max_word_length): + words = [] + for _ in range(num_words): + word_length = random.randint(1, max_word_length) + word = "".join(random.choices(string.ascii_lowercase, k=word_length)) + words.append(word) + return words + + +def benchmark_generation(model): + batch_size = 64 + context_length = 128 + max_new_tokens = 156 - 128 + print("=" * 30 + " Benchmarking " + "=" * 30) + print( + f"Benchmarking for batch_size = {batch_size} and max total tokens =" + f" {context_length + max_new_tokens}" + ) + + input_str = _generate_random_words(batch_size, context_length) + + tokenizer = AutoTokenizer.from_pretrained(hyperparams.model_name) + inputs = tokenizer(input_str, return_tensors="pt", padding=True) + + max_length = inputs["input_ids"].size(1) + + # Add padding to the left + inputs["input_ids"] = torch.nn.functional.pad( + inputs["input_ids"], (context_length - max_length, 0), value=tokenizer.pad_token_id + ) + + # Add padding to the left (only intended for baseline generation with HF + # which expects padding to the left) + inputs["attention_mask"] = torch.nn.functional.pad( + inputs["attention_mask"], (context_length - max_length, 0), value=0 + ) + + inputs["input_ids"] = inputs["input_ids"].cuda() + inputs["attention_mask"] = inputs["attention_mask"].cuda() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + torch.cuda.synchronize() + start.record() + + model.generate(inputs["input_ids"].cuda(), max_new_tokens=max_new_tokens) + torch.cuda.synchronize() + end.record() + + print(f"Time: {start.elapsed_time(end)/1000:.2f} s.") diff --git a/transformer_engine/common/fused_attn/kv_cache.cu 
b/transformer_engine/common/fused_attn/kv_cache.cu index af69faaabe..ea468e435b 100644 --- a/transformer_engine/common/fused_attn/kv_cache.cu +++ b/transformer_engine/common/fused_attn/kv_cache.cu @@ -116,14 +116,14 @@ void copy_to_kv_cache_launcher(Tensor new_k, Tensor new_v, Tensor k_cache, Tenso bool is_non_paged, cudaStream_t stream) { if (new_k.has_data() && new_v.has_data() && k_cache.has_data() && v_cache.has_data()) { if (is_non_paged) { - reindex_kv_cache_kernel<<<16, 256, 0, stream>>>( + reindex_kv_cache_kernel<<<128, 1024, 0, stream>>>( reinterpret_cast(k_cache.data.dptr), reinterpret_cast(v_cache.data.dptr), reinterpret_cast(page_table.data.dptr), reinterpret_cast(cu_new_lens.data.dptr), reinterpret_cast(cu_cached_lens.data.dptr), h_kv, d_k, d_v, b, max_seq_len); } - copy_to_kv_cache_kernel<<<16, 256, 0, stream>>>( + copy_to_kv_cache_kernel<<<128, 1024, 0, stream>>>( reinterpret_cast(new_k.data.dptr), reinterpret_cast(new_v.data.dptr), reinterpret_cast(k_cache.data.dptr), reinterpret_cast(v_cache.data.dptr), reinterpret_cast(page_table.data.dptr), diff --git a/transformer_engine/pytorch/attention/inference.py b/transformer_engine/pytorch/attention/inference.py index 8267bf63c7..62a724ef79 100644 --- a/transformer_engine/pytorch/attention/inference.py +++ b/transformer_engine/pytorch/attention/inference.py @@ -214,6 +214,11 @@ def __init__( dtype=torch.int32, device=torch.cuda.current_device(), ) + self.pre_step_seqlens = torch.zeros( + self.max_batch_size, + dtype=torch.int32, + device=torch.cuda.current_device(), + ) def reset(self): """Reset InferenceParams state""" @@ -266,6 +271,11 @@ def pre_step( for k, v in self.sequences.items(): self.sequences_pre_step[k] = v - step_dict[k] + pre_step_seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( + dtype=torch.int32, device="cpu" + ) + self.pre_step_seqlens[: len(pre_step_seqlens)].copy_(pre_step_seqlens, non_blocking=True) + seqlens_q = list(step_dict.values()) cu_seqlens_q = [0] + [sum(seqlens_q[:i]) for i in range(1, self.batch_size + 1)] cu_seqlens_q = cu_seqlens_q + [cu_seqlens_q[-1]] * (self.max_batch_size - self.batch_size) @@ -280,9 +290,13 @@ def pre_step( def get_seqlens_pre_step(self): """Get cached sequence lengths before the stepping""" - return torch.Tensor(list(self.sequences_pre_step.values())).to( - dtype=torch.int32, device="cpu" - ) + # seqlens = torch.Tensor(list(self.sequences_pre_step.values())).to( + # dtype=torch.int32, device="cpu" + # ) + # # return seqlens.cuda() + # self.cu_pre_step_seqlens[:len(seqlens)].copy_(seqlens, non_blocking=True) + # return self.cu_pre_step_seqlens + return self.pre_step_seqlens def convert_paged_to_nonpaged(self, layer_number: int): """ @@ -455,14 +469,14 @@ def pre_step( finished_seqs = self.sequences.keys() - unfinished_seqs unfinished_indices = [i for i, j in enumerate(self.sequences) if j in unfinished_seqs] finished_indices = [i for i, j in enumerate(self.sequences) if j in finished_seqs] - self.batch_indices.copy_( + self.batch_indices.data[:].copy_( torch.Tensor( ( unfinished_indices + finished_indices + list(range(prev_batch_size, self.max_batch_size)) ) - ).to(dtype=torch.int32, device="cpu") + ) ) # Advance unfinished sequences diff --git a/transformer_engine/pytorch/attention/multi_head_attention.py b/transformer_engine/pytorch/attention/multi_head_attention.py index f018465dc1..cac16d9de6 100644 --- a/transformer_engine/pytorch/attention/multi_head_attention.py +++ b/transformer_engine/pytorch/attention/multi_head_attention.py @@ -768,8 +768,8 @@ 
def forward( # sequence_start = inference_params.seqlens[0] sequence_end = sequence_start + sequence_length - q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...] - k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...] + # q_pos_emb = q_pos_emb[sequence_start:sequence_end, ...] + # k_pos_emb = k_pos_emb[sequence_start:sequence_end, ...] query_layer = apply_rotary_pos_emb( query_layer, @@ -779,6 +779,7 @@ def forward( cu_seqlens=cu_seqlens_q, cp_size=self.cp_size, cp_rank=self.cp_rank, + start_positions=sequence_start, interleaved=self.rotary_pos_interleaved, ) key_layer = apply_rotary_pos_emb( @@ -789,6 +790,7 @@ def forward( cu_seqlens=cu_seqlens_kv, cp_size=self.cp_size, cp_rank=self.cp_rank, + start_positions=sequence_start, interleaved=self.rotary_pos_interleaved, ) diff --git a/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp b/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp index fe640f67c0..781002f154 100644 --- a/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp +++ b/transformer_engine/pytorch/csrc/extensions/apply_rope.cpp @@ -28,9 +28,10 @@ at::Tensor fused_rope_forward(const at::Tensor &input, const at::Tensor &freqs, auto freqs_cu = makeTransformerEngineTensor(freqs); auto output_cu = makeTransformerEngineTensor(output); - auto start_positions_cu = TensorWrapper(); // empty cu_seqlens tensor + auto start_positions_cu = TensorWrapper(); // empty start_positions tensor if (start_positions) { start_positions_cu = makeTransformerEngineTensor(start_positions.value()); + TORCH_CHECK(start_positions_cu.ndim() == 1, "expected 1D tensor"); } if (qkv_format == NVTE_QKV_Format::NVTE_THD) {