[PyTorch debug] Improve precision debug tools performance #1909

Open: wants to merge 21 commits into main
2 changes: 1 addition & 1 deletion build_tools/pytorch.py
@@ -17,7 +17,7 @@ def install_requirements() -> List[str]:
     reqs = ["torch>=2.1", "einops"]
     reqs.append(
         "nvdlfw-inspect @"
-        " git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git@v0.1#egg=nvdlfw-inspect"
+        " git+https://github.com/NVIDIA/nvidia-dlfw-inspect.git@v0.2#egg=nvdlfw-inspect"
     )
     return reqs

3 changes: 3 additions & 0 deletions qa/L0_pytorch_debug_unittest/test.sh
@@ -19,6 +19,9 @@ pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py --feature_dirs=$NVTE_T
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/debug/test_perf.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
+
 
 # standard numerics tests with initialized debug
 NVTE_TEST_NVINSPECT_ENABLED=True NVTE_TEST_NVINSPECT_CONFIG_FILE=$NVTE_TEST_NVINSPECT_DUMMY_CONFIG_FILE NVTE_TEST_NVINSPECT_FEATURE_DIRS=$NVTE_TEST_NVINSPECT_FEATURE_DIRS PYTORCH_JIT=0 NVTE_TORCH_COMPILE=0 NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 pytest -v -s $TE_PATH/tests/pytorch/test_numerics.py || FAIL=1
37 changes: 18 additions & 19 deletions tests/pytorch/debug/test_api_features.py
@@ -26,20 +26,20 @@ def test_transformer_engine_no_config(feature_dirs):
             "decoder.1.attn.qkv", gemm="fprop", iteration=0
         )
 
-        # modify_tensor_enabled - False by default
+        # modify_tensor_enabled - (False, float("inf")) by default
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
-        )
+        )[0]
 
-        # inspect_tensor_enabled - False by default
+        # inspect_tensor_enabled - (False, float("inf")) by default
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.1.attn.qkv", tensor_name="activation", iteration=0
-        )
+        )[0]
 
-        # inspect_tensor_postquantize - False by default
+        # inspect_tensor_postquantize - (False, float("inf")) by default
         assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
             "decoder.1.attn.qkv", gemm="fprop", tensor_name="activation", iteration=0
-        )
+        )[0]
 
     finally:
         debug_api.end_debug()
@@ -120,13 +120,13 @@ def test_per_tensor_scaling(configs_dir, feature_dirs):
         )
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="dgrad", tensor_name="weight", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="wgrad", tensor_name="gradient", iteration=0
-        )
+        )[0]
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc1", gemm="wgrad", tensor_name="activation", iteration=0
-        )
+        )[0]
 
         # check modify_tensor
 
@@ -168,14 +168,14 @@ def test_per_tensor_scaling(configs_dir, feature_dirs):
             gemm="wgrad",
             tensor_name="gradient",
             iteration=0,
-        )
+        )[0]
 
         assert not debug_api.transformer_engine.modify_tensor_enabled(
             "decoder.1.mlp.fc4",
             gemm="fprop",
             tensor_name="activation",
             iteration=0,
-        )
+        )[0]
     finally:
         debug_api.end_debug()
 
@@ -265,21 +265,20 @@ def assert_empty():
         assert stats[("decoder.1.mlp.fc1", "activation", "cur_amax", 200)] == tensor.abs().max()
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.1.mlp.fc1", tensor_name="activation", iteration=201
-        )
+        )[0]
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.2.mlp.fc1", tensor_name="activation", iteration=200
-        )
+        )[0]
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.1.mlp.fc1", tensor_name="gradient", iteration=200
-        )
+        )[0]
 
         expected_underflows = (tensor_fp8._data == 0).sum() * 100 / (100 * 100 * 5)
         expected_overflows = (tensor_fp8._data == 126).sum() * 100 / (100 * 100 * 5)
 
         # TE FP8 tensor stats --
         assert debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
             "decoder.1.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
-        )
+        )[0]
         debug_api.transformer_engine.inspect_tensor_postquantize(
             "decoder.1.mlp.fc1",
             tensor=tensor_fp8,
@@ -295,10 +294,10 @@ def assert_empty():
 
         assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
             "decoder.1.mlp.fc1", tensor_name="activation", gemm="fprop", iteration=201
-        )
+        )[0]
         assert not debug_api.transformer_engine.inspect_tensor_postquantize_enabled(
             "decoder.2.mlp.fc1", tensor_name="gradient", gemm="wgrad", iteration=200
-        )
+        )[0]
 
         # Second config in same yaml
         tensor = torch.rand((100, 100, 5))
@@ -328,7 +327,7 @@ def assert_empty():
 
         assert not debug_api.transformer_engine.inspect_tensor_enabled(
             "decoder.7.mlp.fc1", tensor_name="weight", iteration=201
-        )
+        )[0]
         assert_empty()
 
     finally:
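The recurring ")" to ")[0]" edits above track the API change shipped with nvdlfw-inspect v0.2: the *_enabled hooks now return a tuple whose default is (False, float("inf")) instead of a bare bool. A minimal, hedged sketch of how a caller might consume the new return value follows; treating the second element as the next iteration at which the answer can change is an assumption inferred from the float("inf") default and the caching behavior exercised by test_perf.py below, not a documented contract.

import nvdlfw_inspect.api as debug_api

# Hedged sketch (assumes debug_api.initialize(...) was already called, as in
# the tests above). `enabled` is the old boolean answer; `next_iter` is
# assumed to be the next iteration at which the answer may change
# (float("inf") when the feature never activates), letting callers cache the
# result and skip per-iteration checks.
enabled, next_iter = debug_api.transformer_engine.inspect_tensor_enabled(
    "decoder.1.mlp.fc1", tensor_name="activation", iteration=0
)
if enabled:
    ...  # run the inspection feature this iteration
# otherwise the layer can stay in non-debug mode until `next_iter`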
19 changes: 19 additions & 0 deletions tests/pytorch/debug/test_configs/log_config.yaml
@@ -0,0 +1,19 @@
test:
  enabled: True
  layers:
    layer_name_regex_pattern: .*
  transformer_engine:
    LogTensorStats:
      enabled: True
      tensors_struct:
        - tensor: activation
          stats: [cur_amax, dynamic_range, mean, std, l1_norm]
          start_step: 1
          freq: 3
    LogFp8TensorStats:
      enabled: True
      tensors: weight
      stats: [underflows%]
      start_step: 1
      freq: 3
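Read together with test_log.py below: with start_step: 1 and freq: 3, every layer (regex .*) should log the listed activation stats at iterations 3, 6 and 9 of a 10-step run. A hypothetical one-liner for the expected set, assuming the iteration % freq == 0 semantics that the test asserts:

# Hypothetical: iterations expected in the statistics log for a 10-step run,
# assuming stats are emitted when iteration % freq == 0 (freq = 3).
expected_logged = [i for i in range(1, 11) if i % 3 == 0]  # [3, 6, 9]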

13 changes: 13 additions & 0 deletions tests/pytorch/debug/test_configs/perf_config.yaml
@@ -0,0 +1,13 @@
test:
  enabled: True
  layers:
    layer_name_regex_pattern: .*1
  transformer_engine:
    LogTensorStats:
      enabled: True
      tensors_struct:
        - tensor: activation
          stats: [cur_amax, dynamic_range, mean, std, l1_norm]
          start_step: 0
          freq: 100000
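This is the config driving test_perf.py below: with start_step: 0 and freq: 100000, stats are logged once at the warm-up iteration and then not again within the test's at most 18000 timed iterations, so every layer should drop into non-debug mode for the whole timed loop. A hedged illustration of that arithmetic (the modulo semantics is an assumption carried over from the log test above):

# Hypothetical helper showing when this config triggers logging.
def logs_at(iteration, start_step=0, freq=100000):
    # Assumed semantics: log once iteration >= start_step, whenever
    # iteration % freq == 0.
    return iteration >= start_step and iteration % freq == 0

assert logs_at(0)                                    # warm-up iteration logs
assert not any(logs_at(i) for i in range(1, 18000))  # timed loop never logs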

58 changes: 58 additions & 0 deletions tests/pytorch/debug/test_log.py
@@ -0,0 +1,58 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.


import pytest
import torch
import transformer_engine.pytorch as te
import tempfile
import os

import nvdlfw_inspect.api as debug_api

from transformer_engine.debug.pytorch.debug_state import TEDebugState


@pytest.mark.parametrize("layer", ["linear", "transformer"])
def test_log_every_3_layers(layer, configs_dir, feature_dirs):
    # If a layer does not invoke any feature in the current iteration,
    # it switches into non-debug mode.
    # This test checks that this works correctly -
    # the layer should be logged every 3 iterations.
    with tempfile.TemporaryDirectory() as temp_dir:
        debug_api.initialize(
            config_file=configs_dir + "/log_config.yaml",
            feature_dirs=feature_dirs,
            log_dir=temp_dir,
        )

        if layer == "linear":
            model = te.Linear(128, 128, name="linear1")
        elif layer == "transformer":
            model = te.TransformerLayer(128, 128, 4, name="transformer1")
        else:
            raise ValueError(f"Invalid layer: {layer}")

        for i in range(10):
            x = torch.randn(4, 4, 128).cuda()
            with te.fp8_autocast(enabled=True):
                y = model(x)
            y.sum().backward()
            debug_api.step()

        with open(
            os.path.join(
                temp_dir, "nvdlfw_inspect_statistics_logs/nvdlfw_inspect_globalrank-0.log"
            ),
            "r",
        ) as f:
            file_content = f.read()
            for i in range(1, 11):
                if i % 3 == 0:
                    assert f"iteration={i:06d}" in file_content
                else:
                    assert f"iteration={i:06d}" not in file_content

        debug_api.end_debug()
        TEDebugState._reset()
76 changes: 76 additions & 0 deletions tests/pytorch/debug/test_perf.py
@@ -0,0 +1,76 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.


import pytest
import torch
import transformer_engine.pytorch as te
import time

import nvdlfw_inspect.api as debug_api

from transformer_engine.debug.pytorch.debug_state import TEDebugState


def _run_cpu_overhead(debug_tools_initialized, layer, configs_dir, feature_dirs):
    debug_api.end_debug()
    TEDebugState._reset()
    if debug_tools_initialized:
        # This config logs stats starting from step 0, every N iterations for
        # a huge N >> NUM_ITERS. So after one warm-up iteration, these layers
        # should run in non-debug mode.
        debug_api.initialize(
            config_file=configs_dir + "/perf_config.yaml", feature_dirs=feature_dirs
        )

    try:
        if layer == "linear":
            model = torch.nn.Sequential(
                te.Linear(1, 1, name="linear1"), te.Linear(1, 1, name="linear2")
            ).cuda()
            NUM_ITERS = 18000
        elif layer == "transformer":
            model = torch.nn.Sequential(
                te.TransformerLayer(1, 1, 1, name="transformer1"),
                te.TransformerLayer(1, 1, 1, name="transformer2"),
            ).cuda()
            NUM_ITERS = 2000

        x = torch.randn(1, 1, 1).cuda()

        y = model(x)
        y.sum().backward()
        debug_api.step()
        torch.cuda.synchronize()

        time_start = time.time()
        for i in range(NUM_ITERS):
            y = model(x)
            y.sum().backward()
            if debug_tools_initialized:
                debug_api.step()
        torch.cuda.synchronize()
        time_end = time.time()

    finally:
        if debug_tools_initialized:
            debug_api.end_debug()

    return time_end - time_start


@pytest.mark.parametrize("layer", ["linear", "transformer"])
def test_cpu_overhead(layer, configs_dir, feature_dirs):
    # Runs one layer many times on a very small tensor
    # - GPU time should be negligible, so total time is dominated by CPU time.
    # If a layer does not invoke any feature in the current iteration,
    # it switches into non-debug mode and should have no non-negligible CPU
    # overhead compared to a layer without debug tools initialized.

    with_debug_tools = _run_cpu_overhead(True, layer, configs_dir, feature_dirs)
    without_debug_tools = _run_cpu_overhead(False, layer, configs_dir, feature_dirs)

    print(f"with_debug_tools: {with_debug_tools} s")
    print(f"without_debug_tools: {without_debug_tools} s")

    assert with_debug_tools < without_debug_tools * 1.1  # 10% overhead margin