NVIDIA · pggPL · Jun 26, 2025 · Jun 26, 2025 · Jun 26, 2025 · Jun 30, 2025
diff --git a/docs/debug/3_api_te_calls.rst b/docs/debug/3_api_te_calls.rst
@@ -12,14 +12,7 @@ Let's look deeper into how Nvidia-DL-Framework-Inspect with Transformer Engine w
 
    Fig 1: Example of Nvidia-DL-Framework-Inspect affecting training script with 1 Linear Layer. For tensors mentioned in ``config.yaml``, behavior of ``modify_tensor_enabled()`` and ``modify_tensor()`` calls are substituted with definitions from the feature class. Other calls return default values - in fact they do nothing.
 
-In this page, all calls from TransformerEngine to the Nvidia-DL-Framework-Inspect for each GEMM are listed. The order of these calls is illustrated in the image below.
-
-.. figure:: ./img/api_calls2.svg
-   :align: center
-
-   Fig 2: The calls to Nvidia-DL-Framework-Inspect done for Transformer Engine. There are 2 types of calls: GEMM calls and routing calls.
-
-
+This page lists all calls made from Transformer Engine to Nvidia-DL-Framework-Inspect for each GEMM.
 There are 2 categories of API calls, each is used for different purposes:
 
 - GEMM calls - invoked during every GEMM, used to process or quantize tensors and collect information about them,
@@ -32,14 +25,19 @@ if fusions happen. An important remark is that if no feature is used for the lay
 
 .. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.modify_tensor
 
-.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor
-
-.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize
 
 .. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.modify_tensor_enabled
 
 .. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.fp8_gemm_enabled
 
+.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_all
+
+.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_all_enabled
+
+.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor
+
+.. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize
+
 .. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_enabled
 
 .. autoapifunction:: transformer_engine.debug.features.api.TEDefaultFeatures.inspect_tensor_postquantize_enabled
diff --git a/docs/debug/img/api_calls2.svg b/docs/debug/img/api_calls2.svg
diff --git a/qa/L0_pytorch_debug_unittest/test.sh b/qa/L0_pytorch_debug_unittest/test.sh
@@ -18,6 +18,7 @@ pip install pytest==8.2.1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_sanity.py  --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_config.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 pytest -v -s $TE_PATH/tests/pytorch/debug/test_numerics.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
+pytest -v -s $TE_PATH/tests/pytorch/debug/test_log.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS || FAIL=1
 NVTE_TORCH_COMPILE=0 pytest -v -s $TE_PATH/tests/pytorch/debug/test_api_features.py --feature_dirs=$NVTE_TEST_NVINSPECT_FEATURE_DIRS --configs_dir=$NVTE_TEST_NVINSPECT_CONFIGS_DIR || FAIL=1
 
 # standard numerics tests with initialized debug

diff --git a/tests/pytorch/debug/run_distributed.py b/tests/pytorch/debug/run_distributed.py
@@ -346,6 +346,40 @@ def get_stat(tensor, stat):
     set_weight_tensor_tp_group_reduce(True)  # reset
 
 
+@run_debug_test
+def sanity_test_log_quantized_stats(parallel_mode, gather_weight, **kwargs):
+    """Initialize debug with ``LOG_QUANTIZED_CONFIG`` and call ``_run_forward_backward`` once.
+
+    Ranks form two groups of ``WORLD_SIZE // 2`` consecutive ranks; an odd ``WORLD_SIZE``
+    is skipped. Nothing is asserted. Expects ``config_file`` and ``log_dir`` in ``kwargs``.
+    """
+    from test_log import LOG_QUANTIZED_CONFIG
+
+    kwargs["config_file"].write(LOG_QUANTIZED_CONFIG)
+    kwargs["config_file"].flush()
+    _init_debug(kwargs["config_file"].name, kwargs["log_dir"], FEATURE_DIRS)
+    DP_SIZE = 2
+    if WORLD_SIZE % DP_SIZE != 0:
+        return  # skip; the flag below is not set yet, so nothing leaks to later tests
+    set_weight_tensor_tp_group_reduce(gather_weight)
+    TP_SIZE = WORLD_SIZE // DP_SIZE
+    TP_RANK = WORLD_RANK % TP_SIZE
+    DP_RANK = (WORLD_RANK - TP_RANK) // TP_SIZE
+    debug_api.set_tensor_reduction_group(NCCL_WORLD)
+
+    x, weight = _get_tensors(
+        parallel_mode,
+        weight_seed=TP_RANK * 1234,
+        data_seed=DP_RANK * 1234,
+        tp_size=TP_SIZE,
+        tp_rank=TP_RANK,
+    )
+    tp_group = dist.new_group(ranks=list(range(DP_RANK * TP_SIZE, (DP_RANK + 1) * TP_SIZE)))
+    model = _init_model(weight, parallel_mode=parallel_mode, tp_group=tp_group)
+    _run_forward_backward(x, model, parallel_mode=parallel_mode, group=tp_group)
+    set_weight_tensor_tp_group_reduce(True)  # reset
+
+
 @run_debug_test
 def test_log_expert_parallel(**kwargs):
     """