NVIDIA · phu0ngng · Jul 11, 2025 · Jul 11, 2025
diff --git a/transformer_engine/common/include/transformer_engine/normalization.h b/transformer_engine/common/include/transformer_engine/normalization.h
@@ -149,6 +149,10 @@ void nvte_rmsnorm_bwd(const NVTETensor dz, const NVTETensor x, const NVTETensor
 void nvte_enable_cudnn_norm_fwd(bool enable);
 void nvte_enable_cudnn_norm_bwd(bool enable);
 
+/*! \brief Query whether the cuDNN backend is enabled for normalization forward/backward. */
+int nvte_use_cudnn_norm_fwd();
+int nvte_use_cudnn_norm_bwd();
+
 /*! \brief Control whether norm computes `gamma += 1.0` for zero-centered gamma
  *  in weight dtype. If set to false, it will compute in compute dtype.
  *

diff --git a/transformer_engine/common/normalization/common.cpp b/transformer_engine/common/normalization/common.cpp
@@ -539,3 +539,13 @@ void nvte_enable_zero_centered_gamma_in_weight_dtype(bool enable) {
   NVTE_API_CALL(nvte_enable_zero_centered_gamma_in_weight_dtype);
   transformer_engine::normalization::_zero_centered_gamma_in_weight_dtype() = enable;
 }
+
+int nvte_use_cudnn_norm_fwd() {  // Query declared in normalization.h; takes no arguments.
+  NVTE_API_CALL(nvte_use_cudnn_norm_fwd);  // Project macro (defined elsewhere), given this name.
+  return transformer_engine::normalization::use_cudnn_norm_fwd();  // Forward flag, as int.
+}
+
+int nvte_use_cudnn_norm_bwd() {  // Query declared in normalization.h; takes no arguments.
+  NVTE_API_CALL(nvte_use_cudnn_norm_bwd);  // Project macro (defined elsewhere), given this name.
+  return transformer_engine::normalization::use_cudnn_norm_bwd();  // Backward flag, as int.
+}
diff --git a/transformer_engine/jax/cpp_extensions/normalization.py b/transformer_engine/jax/cpp_extensions/normalization.py
@@ -71,8 +71,7 @@ def is_norm_fwd_cudnn_enabled(scaling_mode: ScalingMode) -> bool:
     """Retrieves whether CuDNN norm fwd is enabled."""
     # MXFP8_1D_SCALING always uses CuDNN currently
     return (
-        int(os.getenv("NVTE_NORM_FWD_USE_CUDNN", "0")) == 1
-        or scaling_mode == ScalingMode.MXFP8_1D_SCALING
+        transformer_engine_jax.use_cudnn_norm_fwd() or scaling_mode == ScalingMode.MXFP8_1D_SCALING
     )
 
 

diff --git a/transformer_engine/jax/csrc/extensions/normalization.cpp b/transformer_engine/jax/csrc/extensions/normalization.cpp
@@ -36,6 +36,9 @@ pybind11::tuple GetNormForwardWorkspaceSizes(size_t batch_size, size_t hidden_si
     output_tensor.set_columnwise_data(static_cast<void *>(&temp), out_dtype, input_shape);
   }
 
+  // Enable norm with cuDNN instead of TE kernels
+  nvte_enable_cudnn_norm_fwd(true);
+
   // dummy tensor wrappers that will carry workspace size info later
   TensorWrapper dummy_work_tensor;
   auto num_sm = cudaDevicePropertiesManager::Instance().GetMultiProcessorCount() - sm_margin;
@@ -138,6 +141,9 @@ Error_Type NormForwardFFI(cudaStream_t stream, Buffer_Type x_buf, Buffer_Type sc
                             colwise_scale_inv_buf->dimensions().back()});
   }
 
+  // Enable norm with cuDNN instead of TE kernels
+  nvte_enable_cudnn_norm_fwd(true);
+
   if (_norm_type == NVTE_Norm_Type::LayerNorm) {
     NVTE_CHECK(w_dtype == convert_ffi_datatype_to_te_dtype(beta_buf.element_type()),
                "gamma and beta must have the same data type.");
@@ -200,6 +206,9 @@ pybind11::tuple GetNormBackwardWorkspaceSizes(size_t batch_size, size_t hidden_s
   TensorWrapper dummy_work_tensor;
   auto num_sm = cudaDevicePropertiesManager::Instance().GetMultiProcessorCount() - sm_margin;
 
+  // Enable norm with cuDNN instead of TE kernels
+  nvte_enable_cudnn_norm_bwd(true);
+
   if (norm_type == NVTE_Norm_Type::LayerNorm) {
     auto mu_tensor = TensorWrapper(nullptr, intermediates_shape, intermediates_dtype);
     auto dbeta_tensor = TensorWrapper(nullptr, weight_shape, w_dtype);
@@ -270,6 +279,9 @@ Error_Type NormBackwardFFI(cudaStream_t stream, Buffer_Type dz_buf, Buffer_Type
   auto workspace_shape = std::vector<size_t>{wkspace_size};
   auto workspace_tensor = TensorWrapper(workspace, workspace_shape, wkspace_dtype);
 
+  // Enable norm with cuDNN instead of TE kernels
+  nvte_enable_cudnn_norm_bwd(true);
+
   if (static_cast<NVTE_Norm_Type>(norm_type) == NVTE_Norm_Type::LayerNorm) {
     auto mu_tensor = TensorWrapper(mu, intermediates_shape, intermediates_dtype);
     auto dbeta_tensor = TensorWrapper(dbeta, weight_shape, w_dtype);

diff --git a/transformer_engine/jax/csrc/extensions/pybind.cpp b/transformer_engine/jax/csrc/extensions/pybind.cpp
@@ -78,6 +78,7 @@ PYBIND11_MODULE(transformer_engine_jax, m) {
   m.def("get_fused_attn_fwd_workspace_sizes", &GetFusedAttnForwardWorkspaceSizes);
   m.def("get_fused_attn_bwd_workspace_sizes", &GetFusedAttnBackwardWorkspaceSizes);
   m.def("nvte_get_qkv_format", &nvte_get_qkv_format);
+  m.def("use_cudnn_norm_fwd", &nvte_use_cudnn_norm_fwd);
 
   pybind11::enum_<DType>(m, "DType", pybind11::module_local())
       .value("kByte", DType::kByte)