NVIDIA · pggPL · Jun 6, 2025 · Jun 16, 2025 · Jun 17, 2025 · Jun 17, 2025
diff --git a/tests/pytorch/fused_attn/run_fused_attn_with_cp.py b/tests/pytorch/fused_attn/run_fused_attn_with_cp.py
@@ -95,6 +95,7 @@ def run_dpa_with_cp(
         qkv_format=qkv_format,
         attn_mask_type=config.attn_mask_type,
         window_size=config.window_size,
+        chunk_size=config.chunk_size,
     )
     core_attn = core_attn.cuda()
 
@@ -284,6 +285,7 @@ def run_dpa_with_cp(
             cu_seqlens_q_padded=cu_seqlens_q_padded,
             cu_seqlens_kv_padded=cu_seqlens_kv_padded,
         )
+
         if fp8_mha:
             dout_fp8_ = dout_quantizer(dout_)
             out_.backward(dout_fp8_)
@@ -401,8 +403,11 @@ def _error(a, b):
             _error(a[0], b[0])
             _error(a[1], b[1])
     elif qkv_format == "thd":
-        for a, b in zip([out_, dq_, dk_, dv_], [out, dq, dk, dv]):
+        # Pair each tensor with its label so the per-rank message names the comparison.
+        names = ["out_", "dq_", "dk_", "dv_"]
+        for name, a, b in zip(names, [out_, dq_, dk_, dv_], [out, dq, dk, dv]):
             _error(a, b)
+            print(f"{name} passed on rank {rank}")
     else:
         assert False, f"{qkv_format} is an unsupported qkv_format!"
 

diff --git a/tests/pytorch/fused_attn/test_fused_attn.py b/tests/pytorch/fused_attn/test_fused_attn.py
@@ -90,6 +90,7 @@ def __init__(
         window_size: Tuple[int, int] = (-1, -1),
         total_requests: int = None,
         max_ctx_len: int = None,
+        chunk_size: int = None,
     ):
         self.batch_size = batch_size
         self.num_heads = num_heads
@@ -110,6 +111,7 @@ def __init__(
         self.window_size = window_size
         self.total_requests = total_requests
         self.max_ctx_len = max_ctx_len
+        self.chunk_size = chunk_size
 
 
 @contextmanager

diff --git a/tests/pytorch/fused_attn/test_fused_attn_with_cp.py b/tests/pytorch/fused_attn/test_fused_attn_with_cp.py
@@ -24,6 +24,9 @@
     "cp_1_3": ModelConfig(
         2, 12, 12, 128, 4096, 4096, 0.0, "no_mask", "no_bias", window_size=(512, 512)
     ),  # MHA
+    "cp_1_4": ModelConfig(
+        2, 12, 12, 128, 4096, 4096, 0.0, "causal", "no_bias", chunk_size=1024
+    ),  # MHA with chunks
     "cp_2_0": ModelConfig(2, 12, 2, 128, 4096, 4096, 0.0, "causal", "no_bias"),  # GQA
     "cp_2_1": ModelConfig(2, 12, 2, 128, 4096, 4096, 0.0, "no_mask", "no_bias"),  # GQA
     "cp_2_2": ModelConfig(
@@ -100,6 +103,9 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
     "cp_1_4": ModelConfig(
         2, 12, 12, 128, 4096, 4096, 0.0, "causal", "no_bias", window_size=(512, 0)
     ),  # MHA
+    "cp_1_5": ModelConfig(
+        2, 12, 12, 128, 4096, 4096, 0.0, "causal", "no_bias", chunk_size=1024
+    ),  # MHA with chunks
     "cp_2_0": ModelConfig(2, 12, 2, 128, 4096, 4096, 0.0, "causal", "no_bias"),  # GQA
     "cp_2_1": ModelConfig(2, 12, 2, 128, 4096, 4096, 0.0, "no_mask", "no_bias"),  # GQA
     "cp_2_2": ModelConfig(2, 12, 2, 128, 4096, 4096, 0.0, "causal", "post_scale_bias"),  # GQA
@@ -144,6 +150,8 @@ def test_cp_with_fused_attention(dtype, model, qkv_format, cp_comm_type, fp8_mha
     config = model_configs_fused_attn[model]
     if qkv_format == "thd" and config.attn_bias_type == "post_scale_bias":
         pytest.skip("THD format does not support post_scale_bias yet!")
+    if qkv_format != "thd" and config.chunk_size is not None:
+        pytest.skip("Only THD format supports chunking!")
     if qkv_format == "thd" and cp_comm_type == "all_gather":
         pytest.skip("CP implementation with KV all-gather does not support THD format yet!")
     if qkv_format == "thd" and "a2a" in cp_comm_type: