Skip to content

[Common] NVFP4 kernels #1904

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 35 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
e219db8
Fixed conflicts
Oleg-Goncharov Mar 31, 2025
48fbb42
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2025
eed3134
Minor code refactoring to avoid unnecessary checks
Oleg-Goncharov Mar 31, 2025
1531249
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 31, 2025
7368d8a
Fixed typo
Oleg-Goncharov Mar 31, 2025
23c5855
Fixed dBias accumulation error due to initialization. Minor code refa…
Oleg-Goncharov Apr 1, 2025
fd5393a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2025
0457e52
Test case to reproduce the init error
Oleg-Goncharov Apr 1, 2025
dad3f47
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 1, 2025
e68d0c8
Fixed rowwise dbias error
Oleg-Goncharov Apr 2, 2025
1797748
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 2, 2025
f3db6c3
Changed ptx API
Oleg-Goncharov Apr 3, 2025
3b192e4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 3, 2025
97dc2b5
Added a struct for two packed FP8 values
Oleg-Goncharov Apr 4, 2025
25809ef
Rolled back to scalar code for columnwise scaling due to its better p…
Oleg-Goncharov Apr 8, 2025
1ac4b52
Minor corrections
Oleg-Goncharov Apr 10, 2025
a890735
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 10, 2025
2e06498
Rebased on main
Oleg-Goncharov Apr 20, 2025
b8e63c7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 20, 2025
9ed02ca
Fixes per code review
Oleg-Goncharov May 6, 2025
b5a7cbf
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 6, 2025
6ca9799
Removed constexpr in C++ test suite to build faster
Oleg-Goncharov May 9, 2025
51149e9
Computed activations are now numerically truncated to InputType befor…
Oleg-Goncharov May 26, 2025
ffa39a3
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 26, 2025
9781e8b
Minor refactoring
Oleg-Goncharov Jun 17, 2025
1fc122b
Minor refactoring
Oleg-Goncharov Jun 18, 2025
559c117
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 18, 2025
6b657e1
Modified mismatches checks of MXFP8 to address FP8 numerics
Oleg-Goncharov Jun 18, 2025
b238cc5
Implemented functional NVFP4 kernel (rowwise scaling) and the test suite
Oleg-Goncharov Jun 27, 2025
e8a173e
Improved NVFP4 kernel performance
Oleg-Goncharov Jul 2, 2025
fd85c69
Small optimizations
Oleg-Goncharov Jul 3, 2025
34ce5ff
Fixed the condition to store scaling factors
Oleg-Goncharov Jul 7, 2025
8a9c26f
Used the 'scale' class member for the second stage NVFP4 scaling
Oleg-Goncharov Jul 7, 2025
97e3d6a
Implemented columnwise scaling
Oleg-Goncharov Jul 9, 2025
58c32c1
Fixed columnwise scale store index. Implemented iterative scheme with…
Oleg-Goncharov Jul 9, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions build_tools/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,9 +237,9 @@ def cuda_archs() -> str:
version = cuda_version()
if os.getenv("NVTE_CUDA_ARCHS") is None:
if version >= (13, 0):
os.environ["NVTE_CUDA_ARCHS"] = "75;80;89;90;100;120"
os.environ["NVTE_CUDA_ARCHS"] = "100a"
elif version >= (12, 8):
os.environ["NVTE_CUDA_ARCHS"] = "70;80;89;90;100;120"
os.environ["NVTE_CUDA_ARCHS"] = "100a"
else:
os.environ["NVTE_CUDA_ARCHS"] = "70;80;89;90"
return os.getenv("NVTE_CUDA_ARCHS")
Expand Down
4 changes: 2 additions & 2 deletions tests/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 3.18)

if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.8)
set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90 100 120)
set(CMAKE_CUDA_ARCHITECTURES 100)
else ()
set(CMAKE_CUDA_ARCHITECTURES 75 80 89 90)
endif()
Expand All @@ -26,7 +26,7 @@ enable_testing()
include_directories(${gtest_SOURCE_DIR}/include ${gtest_SOURCE_DIR})

if(NOT DEFINED TE_LIB_PATH)
execute_process(COMMAND bash -c "pip3 show transformer-engine | grep Location | cut -d ' ' -f 2 | tr -d '\n'"
execute_process(COMMAND bash -c "pip3 show transformer-engine | grep 'Editable project location' | cut -d ' ' -f 4 | tr -d '\n'"
OUTPUT_VARIABLE TE_LIB_PATH)
endif()

Expand Down
47 changes: 24 additions & 23 deletions tests/cpp/operator/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,30 @@
# See LICENSE for license information.

add_executable(test_operator
test_cast.cu
test_cast_current_scaling.cu
test_cast_dbias.cu
test_cast_dbias_dgelu.cu
test_cast_gated_swiglu.cu
test_cast_mxfp8_gated_swiglu.cu
test_qdq.cu
test_cast_mxfp8.cu
test_cast_float8blockwise.cu
test_dequantize_mxfp8.cu
test_transpose.cu
test_cast_transpose.cu
test_cast_transpose_current_scaling.cu
test_cast_transpose_dbias.cu
test_cast_transpose_dbias_dgelu.cu
test_cast_transpose_dgeglu.cu
test_act.cu
test_normalization.cu
test_normalization_mxfp8.cu
test_multi_cast_transpose.cu
test_multi_padding.cu
test_causal_softmax.cu
test_swizzle.cu
# test_cast.cu
# test_cast_current_scaling.cu
# test_cast_dbias.cu
# test_cast_dbias_dgelu.cu
# test_cast_gated_swiglu.cu
# test_cast_mxfp8_gated_swiglu.cu
# test_qdq.cu
# test_cast_mxfp8.cu
test_cast_nvfp4.cu
# test_cast_float8blockwise.cu
# test_dequantize_mxfp8.cu
# test_transpose.cu
# test_cast_transpose.cu
# test_cast_transpose_current_scaling.cu
# test_cast_transpose_dbias.cu
# test_cast_transpose_dbias_dgelu.cu
# test_cast_transpose_dgeglu.cu
# test_act.cu
# test_normalization.cu
# test_normalization_mxfp8.cu
# test_multi_cast_transpose.cu
# test_multi_padding.cu
# test_causal_softmax.cu
# test_swizzle.cu
../test_common.cu)

find_package(OpenMP REQUIRED)
Expand Down
Loading