-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[X86][AVX10.2] Decouple AMX-AVX512 from AVX10.2 #148633
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
According to AVX10.2 rev. 4: AMX-AVX512's explicit AVX10.2 sensitivity is removed and the instructions are removed in favor of inclusion in the ISE/SDM. Users of AMX-AVX512 ISA should follow enabling and checking rules for both AMX and Intel® AVX-512/AVX10. Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 We set amx-avx512 as implying amx-tile, avx512f and evex512 when avx512fp16 and avx512bf16 need to be specified separately.
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang Author: Evgenii Kudriashov (e-kud) ChangesAccording to AVX10.2 rev. 4: > AMX-AVX512's explicit AVX10.2 sensitivity is removed and the instructions are removed in favor of inclusion in the ISE/SDM. Users of AMX-AVX512 ISA should follow enabling and checking rules for both AMX and Intel® AVX-512/AVX10. Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965 We set Patch is 21.56 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/148633.diff 10 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.td b/clang/include/clang/Basic/BuiltinsX86_64.td
index f2b35874e3876..fecaaed37a868 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.td
+++ b/clang/include/clang/Basic/BuiltinsX86_64.td
@@ -290,13 +290,19 @@ let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
def tconjtfp16_internal : X86Builtin<"_Vector<256, int>(unsigned short, unsigned short, _Vector<256, int>)">;
}
-let Features = "amx-avx512,avx10.2-512", Attributes = [NoThrow] in {
+let Features = "amx-avx512", Attributes = [NoThrow] in {
def tcvtrowd2ps_internal : X86Builtin<"_Vector<16, float>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+ def tilemovrow_internal : X86Builtin<"_Vector<16, int>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+}
+
+let Features = "amx-avx512,avx512bf16", Attributes = [NoThrow] in {
def tcvtrowps2bf16h_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
def tcvtrowps2bf16l_internal : X86Builtin<"_Vector<32, __bf16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
+}
+
+let Features = "amx-avx512,avx512fp16", Attributes = [NoThrow] in {
def tcvtrowps2phh_internal : X86Builtin<"_Vector<32, _Float16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
def tcvtrowps2phl_internal : X86Builtin<"_Vector<32, _Float16>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
- def tilemovrow_internal : X86Builtin<"_Vector<16, int>(unsigned short, unsigned short, _Vector<256, int>, unsigned int)">;
}
let Features = "amx-tf32", Attributes = [NoThrow] in {
@@ -382,13 +388,19 @@ let Features = "amx-complex,amx-transpose", Attributes = [NoThrow] in {
def tconjtfp16 : X86Builtin<"void(_Constant unsigned char, _Constant unsigned char)">;
}
-let Features = "amx-avx512,avx10.2-512", Attributes = [NoThrow] in {
+let Features = "amx-avx512", Attributes = [NoThrow] in {
def tcvtrowd2ps : X86Builtin<"_Vector<16, float>(_Constant unsigned char, unsigned int)">;
+ def tilemovrow : X86Builtin<"_Vector<16, int>(_Constant unsigned char, unsigned int)">;
+}
+
+let Features = "amx-avx512,avx512bf16", Attributes = [NoThrow] in {
def tcvtrowps2bf16h : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
def tcvtrowps2bf16l : X86Builtin<"_Vector<32, __bf16>(_Constant unsigned char, unsigned int)">;
+}
+
+let Features = "amx-avx512,avx512fp16", Attributes = [NoThrow] in {
def tcvtrowps2phh : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">;
def tcvtrowps2phl : X86Builtin<"_Vector<32, _Float16>(_Constant unsigned char, unsigned int)">;
- def tilemovrow : X86Builtin<"_Vector<16, int>(_Constant unsigned char, unsigned int)">;
}
let Features = "amx-fp16", Attributes = [NoThrow] in {
diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h
index bbde44fc265b3..e6c58e5c138a1 100644
--- a/clang/lib/Headers/amxavx512intrin.h
+++ b/clang/lib/Headers/amxavx512intrin.h
@@ -16,7 +16,15 @@
#define __DEFAULT_FN_ATTRS_AVX512 \
__attribute__((__always_inline__, __nodebug__, \
- __target__("amx-avx512,avx10.2-512")))
+ __target__("amx-avx512")))
+
+#define __DEFAULT_FN_ATTRS_AVX512BF16 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-avx512,avx512bf16")))
+
+#define __DEFAULT_FN_ATTRS_AVX512FP16 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("amx-avx512,avx512fp16")))
/// Moves a row from a tile register to a zmm destination register, converting
/// the int32 source elements to fp32. The row of the tile is selected by a
@@ -237,25 +245,27 @@ static __inline__ __m512 __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowd2ps_internal(
return __builtin_ia32_tcvtrowd2ps_internal(m, n, src, u);
}
-static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
+static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512BF16
_tile_cvtrowps2bf16h_internal(unsigned short m, unsigned short n,
_tile1024i src, unsigned u) {
return __builtin_ia32_tcvtrowps2bf16h_internal(m, n, src, u);
}
-static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512
+static __inline__ __m512bh __DEFAULT_FN_ATTRS_AVX512BF16
_tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
_tile1024i src, unsigned u) {
return __builtin_ia32_tcvtrowps2bf16l_internal(m, n, src, u);
}
-static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phh_internal(
- unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512FP16
+_tile_cvtrowps2phh_internal(unsigned short m, unsigned short n,
+ _tile1024i src, unsigned u) {
return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
}
-static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512 _tile_cvtrowps2phl_internal(
- unsigned short m, unsigned short n, _tile1024i src, unsigned u) {
+static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512FP16
+_tile_cvtrowps2phl_internal(unsigned short m, unsigned short n,
+ _tile1024i src, unsigned u) {
return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
}
@@ -298,7 +308,7 @@ static __m512 __tile_cvtrowd2ps(__tile1024i src0, unsigned src1) {
/// The 2nd source r32. Size is 4 Bytes.
/// \returns
/// The destination v32bf16 data. Size is 64 Bytes.
-__DEFAULT_FN_ATTRS_AVX512
+__DEFAULT_FN_ATTRS_AVX512BF16
static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
return _tile_cvtrowps2bf16h_internal(src0.row, src0.col, src0.tile, src1);
}
@@ -317,7 +327,7 @@ static __m512bh __tile_cvtrowps2bf16h(__tile1024i src0, unsigned src1) {
/// The 2nd source r32. Size is 4 Bytes.
/// \returns
/// The destination v32bf16 data. Size is 64 Bytes.
-__DEFAULT_FN_ATTRS_AVX512
+__DEFAULT_FN_ATTRS_AVX512BF16
static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
return _tile_cvtrowps2bf16l_internal(src0.row, src0.col, src0.tile, src1);
}
@@ -336,7 +346,7 @@ static __m512bh __tile_cvtrowps2bf16l(__tile1024i src0, unsigned src1) {
/// The 2nd source r32. Size is 4 Bytes.
/// \returns
/// The destination v32fp16 data. Size is 64 Bytes.
-__DEFAULT_FN_ATTRS_AVX512
+__DEFAULT_FN_ATTRS_AVX512FP16
static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
return _tile_cvtrowps2phh_internal(src0.row, src0.col, src0.tile, src1);
}
@@ -355,7 +365,7 @@ static __m512h __tile_cvtrowps2phh(__tile1024i src0, unsigned src1) {
/// The 2nd source r32. Size is 4 Bytes.
/// \returns
/// The destination v32fp16 data. Size is 64 Bytes.
-__DEFAULT_FN_ATTRS_AVX512
+__DEFAULT_FN_ATTRS_AVX512FP16
static __m512h __tile_cvtrowps2phl(__tile1024i src0, unsigned src1) {
return _tile_cvtrowps2phl_internal(src0.row, src0.col, src0.tile, src1);
}
diff --git a/clang/test/CodeGen/X86/amx_avx512_api.c b/clang/test/CodeGen/X86/amx_avx512_api.c
index fac41ea6c214f..54bf72a8f389b 100644
--- a/clang/test/CodeGen/X86/amx_avx512_api.c
+++ b/clang/test/CodeGen/X86/amx_avx512_api.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -flax-vector-conversions=none -ffreestanding -triple=x86_64-unknown-unknown \
-// RUN: -target-feature +amx-avx512 -target-feature +avx10.2-512 \
-// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK
+// RUN: -target-feature +amx-avx512 -emit-llvm -o - -Werror -pedantic | \
+// RUN: FileCheck %s --check-prefixes=CHECK
#include <immintrin.h>
@@ -16,6 +16,7 @@ __m512 test_tile_cvtrowd2ps(__tile1024i a, unsigned b) {
return __tile_cvtrowd2ps(a, b);
}
+__attribute__((__target__("avx512bf16")))
__m512bh test_tile_cvtrowps2bf16h(__tile1024i a, unsigned b) {
//CHECK-LABEL: @test_tile_cvtrowps2bf16h
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
@@ -23,6 +24,7 @@ __m512bh test_tile_cvtrowps2bf16h(__tile1024i a, unsigned b) {
return __tile_cvtrowps2bf16h(a, b);
}
+__attribute__((__target__("avx512bf16")))
__m512bh test_tile_cvtrowps2bf16l(__tile1024i a, unsigned b) {
//CHECK-LABEL: @test_tile_cvtrowps2bf16l
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
@@ -30,6 +32,7 @@ __m512bh test_tile_cvtrowps2bf16l(__tile1024i a, unsigned b) {
return __tile_cvtrowps2bf16l(a, b);
}
+__attribute__((__target__("avx512fp16")))
__m512h test_tile_cvtrowps2phh(__tile1024i a, unsigned b) {
//CHECK-LABEL: @test_tile_cvtrowps2phh
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
@@ -37,6 +40,7 @@ __m512h test_tile_cvtrowps2phh(__tile1024i a, unsigned b) {
return __tile_cvtrowps2phh(a, b);
}
+__attribute__((__target__("avx512fp16")))
__m512h test_tile_cvtrowps2phl(__tile1024i a, unsigned b) {
//CHECK-LABEL: @test_tile_cvtrowps2phl
//CHECK-DAG: call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> {{%.*}})
diff --git a/clang/test/CodeGen/X86/amxavx512-builtins.c b/clang/test/CodeGen/X86/amxavx512-builtins.c
index d60929994901a..f6b8cb421407e 100644
--- a/clang/test/CodeGen/X86/amxavx512-builtins.c
+++ b/clang/test/CodeGen/X86/amxavx512-builtins.c
@@ -1,5 +1,5 @@
// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-avx512 \
-// RUN: -target-feature +avx10.2-512 -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression -flax-vector-conversions=none | FileCheck %s
+// RUN: -emit-llvm -o - -Wall -Werror -pedantic -Wno-gnu-statement-expression -flax-vector-conversions=none | FileCheck %s
#include <immintrin.h>
#include <stddef.h>
@@ -10,24 +10,28 @@ __m512 test_tile_cvtrowd2ps(unsigned int A) {
return _tile_cvtrowd2ps(1, A);
}
+__attribute__((__target__("avx512bf16")))
__m512bh test_tile_cvtrowps2bf16h(unsigned int A) {
// CHECK-LABEL: @test_tile_cvtrowps2bf16h(
// CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2bf16h(i8 1, i32 %{{.*}})
return _tile_cvtrowps2bf16h(1, A);
}
+__attribute__((__target__("avx512bf16")))
__m512bh test_tile_cvtrowps2bf16l(unsigned int A) {
// CHECK-LABEL: @test_tile_cvtrowps2bf16l(
// CHECK: call <32 x bfloat> @llvm.x86.tcvtrowps2bf16l(i8 1, i32 %{{.*}})
return _tile_cvtrowps2bf16l(1, A);
}
+__attribute__((__target__("avx512fp16")))
__m512h test_tile_cvtrowps2phh(unsigned int A) {
// CHECK-LABEL: @test_tile_cvtrowps2phh(
// CHECK: call <32 x half> @llvm.x86.tcvtrowps2phh(i8 1, i32 %{{.*}})
return _tile_cvtrowps2phh(1, A);
}
+__attribute__((__target__("avx512fp16")))
__m512h test_tile_cvtrowps2phl(unsigned int A) {
// CHECK-LABEL: @test_tile_cvtrowps2phl(
// CHECK: call <32 x half> @llvm.x86.tcvtrowps2phl(i8 1, i32 %{{.*}})
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 990b381341f07..83b633b73cd5d 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -277,7 +277,8 @@ def FeatureAMXTRANSPOSE : SubtargetFeature<"amx-transpose", "HasAMXTRANSPOSE", "
def FeatureAMXAVX512 : SubtargetFeature<"amx-avx512",
"HasAMXAVX512", "true",
"Support AMX-AVX512 instructions",
- [FeatureAMXTILE]>;
+ [FeatureAMXTILE, FeatureAVX512,
+ FeatureEVEX512]>;
def FeatureAMXTF32 : SubtargetFeature<"amx-tf32", "HasAMXTF32", "true",
"Support AMX-TF32 instructions",
[FeatureAMXTILE]>;
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 1beaaafb159e3..5d93e6a4089ca 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -550,7 +550,7 @@ let Predicates = [HasAMXMOVRS, In64BitMode], SchedRW = [WriteSystem] in {
} // HasAMXMOVRS, In64BitMode
multiclass m_tcvtrowd2ps {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+ let Predicates = [HasAMXAVX512, In64BitMode] in {
let SchedRW = [WriteSystem] in {
def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, i32u8imm:$src2),
@@ -561,12 +561,12 @@ multiclass m_tcvtrowd2ps {
"tcvtrowd2ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, T8,XS, EVEX, VVVV, EVEX_V512;
}
- } // HasAMXAVX512, HasAVX10_2_512, In64BitMode
+ } // HasAMXAVX512, In64BitMode
}
defm TCVTROWD2PS : m_tcvtrowd2ps;
-let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+let Predicates = [HasAMXAVX512, In64BitMode] in {
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def PTCVTROWD2PSrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2),
@@ -630,7 +630,7 @@ let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
multiclass AMXAVX512_BASE<bits<8> Opcode1, bits<8> Opcode2, string Opstr,
Prefix P1, Prefix P2> {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode], SchedRW = [WriteSystem] in {
+ let Predicates = [HasAMXAVX512, In64BitMode], SchedRW = [WriteSystem] in {
let OpPrefix = P1 in
def rre : I<Opcode1, MRMSrcReg4VOp3, (outs VR512:$dst),
(ins TILE:$src1, GR32:$src2),
@@ -658,7 +658,7 @@ defm TCVTROWPS2BF16H : AMXAVX512_BASE<0x6d, 0x07, "tcvtrowps2bf16h", XD, XD>;
defm TCVTROWPS2BF16L : AMXAVX512_BASE<0x6d, 0x77, "tcvtrowps2bf16l", XS, XS>;
multiclass m_tilemovrow {
- let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+ let Predicates = [HasAMXAVX512, In64BitMode] in {
let SchedRW = [WriteSystem] in {
def rri : Ii8<0x7, MRMSrcReg, (outs VR512:$dst),
(ins TILE:$src1, u8imm:$src2),
@@ -669,12 +669,12 @@ multiclass m_tilemovrow {
"tilemovrow\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[]>, T8,PD, EVEX, VVVV, EVEX_V512;
}
- } // HasAMXAVX512, HasAVX10_2_512, In64BitMode
+ } // HasAMXAVX512, In64BitMode
}
defm TILEMOVROW : m_tilemovrow;
-let Predicates = [HasAMXAVX512, HasAVX10_2_512, In64BitMode] in {
+let Predicates = [HasAMXAVX512, In64BitMode] in {
let SchedRW = [WriteSystem] in {
let usesCustomInserter = 1 in {
def PTILEMOVROWrri : PseudoI<(outs VR512:$dst), (ins u8imm:$src1, i32u8imm:$src2),
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 57fbc71fa22ee..f0349fec8af4e 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -616,7 +616,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_TRANSPOSE = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_MOVRS = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesAMX_AVX512 =
- FeatureAMX_TILE | FeatureAVX10_2_512;
+ FeatureAMX_TILE | FeatureAVX512F | FeatureEVEX512;
constexpr FeatureBitset ImpliedFeaturesAMX_TF32 = FeatureAMX_TILE;
constexpr FeatureBitset ImpliedFeaturesHRESET = {};
diff --git a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll
index 71f8f231747fe..9e73d8c494443 100644
--- a/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll
+++ b/llvm/test/CodeGen/X86/amx-across-func-tilemovrow.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
-; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx10.2-512 -mattr=+amx-avx512 -verify-machineinstrs | FileCheck -check-prefix=O0 %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+amx-avx512 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+amx-avx512 -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8,+amx-avx512 -verify-machineinstrs | FileCheck -check-prefix=O0 %s
@buf = dso_local global [3072 x i8] zeroinitializer, align 64
@@ -95,7 +95,7 @@ define dso_local <16 x i32> @test_api(i16 signext %0, i16 signext %1) nounwind {
; O0-NEXT: movq %rsp, %rbp
; O0-NEXT: andq $-1024, %rsp # imm = 0xFC00
; O0-NEXT: subq $4096, %rsp # imm = 0x1000
-; O0-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; O0-NEXT: vxorps %xmm0, %xmm0, %xmm0
; O0-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; O0-NEXT: movb $1, {{[0-9]+}}(%rsp)
; O0-NEXT: movw %si, %cx
diff --git a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll
index 8f82bd2587ec3..94db7609dfd5d 100644
--- a/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/amx-avx512-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+amx-tile,+amx-avx512,+avx10.2-512 | FileCheck %s
+; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+amx-tile,+amx-avx512 | FileCheck %s
define <16 x float> @test_tcvtrowd2ps(i32 %A) {
; CHECK-LABEL: test_tcvtrowd2ps:
@@ -20,7 +20,7 @@ define <16 x float> @test_tcvtrowd2psi() {
}
declare <16 x float> @llvm.x86.tcvtrowd2ps(i8 %A, i32 %B)
-define <32 x bfloat> @test_tcvtrowps2bf16h(i32 %A) {
+define <32 x bfloat> @test_tcvtrowps2bf16h(i32 %A) "target-features"="+avx512bf16" {
; CHECK-LABEL: test_tcvtrowps2bf16h:
; CHECK: # %bb.0:
; CHECK-NEXT: tcvtrowps2bf16h %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x47,0x48,0x6d,0xc1]
@@ -29,7 +29,7 @@ define <32 x bfloat> @test_tcvtrowps2bf16h(i32 %A) {
ret <32 x bfloat> %ret
}
-define <32 x bfloat> @test_tcvtrowps2bf16hi() {
+define <32 x bfloat> @test_tcvtrowps2bf16hi() "target-features"="+avx512bf16" {
; CHECK-LABEL: test_tcvtrowps2bf16hi:
; CHECK: # %bb.0:
; CHECK-NEXT: tcvtrowps2bf16h $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7f,0x48,0x07,0xc1,0x7f]
@@ -39,7 +39,7 @@ define <32 x bfloat> @test_tcvtrowps2bf16hi() {
}
declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16h(i8 %A, i32 %B)
-define <32 x bfloat> @test_tcvtrowps2bf16l(i32 %A) {
+define <32 x bfloat> @test_tcvtrowps2bf16l(i32 %A) "target-features"="+avx512bf16" {
; CHECK-LABEL: test_tcvtrowps2bf16l:
; CHECK: # %bb.0:
; CHECK-NEXT: tcvtrowps2bf16l %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x46,0x48,0x6d,0xc1]
@@ -48,7 +48,7 @@ define <32 x bfloat> @test_tcvtrowps2bf16l(i32 %A) {
ret <32 x bfloat> %ret
}
-define <32 x bfloat> @test_tcvtrowps2bf16li() {
+define <32 x bfloat> @test_tcvtrowps2bf16li() "target-features"="+avx512bf16" {
; CHECK-LABEL: test_tcvtrowps2bf16li:
; CHECK: # %bb.0:
; CHECK-NEXT: tcvtrowps2bf16l $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7e,0x48,0x77,0xc1,0x7f]
@@ -58,7 +58,7 @@ define <32 x bfloat> @test_tcvtrowps2bf16li() {
}
declare <32 x bfloat> @llvm.x86.tcvtrowps2bf16l(i8 %A, i32 %B)
-define <32 x half> @test_tcvtrowps2phh(i32 %A) {
+define <32 x half> @test_tcvtrowps2phh(i32 %A) "target-features"="+avx512fp16" {
; CHECK-LABEL: test_tcvtrowps2phh:
; CHECK: # %bb.0:
; CHECK-NEXT: tcvtrowps2phh %edi, %tmm1, %zmm0 # encoding: [0x62,0xf2,0x44,0x48,0x6d,0xc1]
@@ -67,7 +67,7 @@ define <32 x half> @test_tcvtrowps2phh(i32 %A) {
ret <32 x half> %ret
}
-define <32 x half> @test_tcvtrowps2phhi() {
+define <32 x half> @test_tcvtrowps2phhi() "target-features"="+avx512fp16" {
; CHECK-LABEL: test_tcvtrowps2phhi:
; CHECK: # %bb.0:
; CHECK-NEXT: tcvtrowps2phh $127, %tmm1, %zmm0 # encoding: [0x62,0xf3,0x7c,0x4...
[truncated]
|
TBH I'm not sure if we need to add |
You can test this locally with the following command:git-clang-format --diff HEAD~1 HEAD --extensions h,c,cpp -- clang/lib/Headers/amxavx512intrin.h clang/test/CodeGen/X86/amx_avx512_api.c clang/test/CodeGen/X86/amxavx512-builtins.c llvm/lib/TargetParser/X86TargetParser.cpp View the diff from clang-format here.diff --git a/clang/lib/Headers/amxavx512intrin.h b/clang/lib/Headers/amxavx512intrin.h
index e6c58e5c1..b9fef3100 100644
--- a/clang/lib/Headers/amxavx512intrin.h
+++ b/clang/lib/Headers/amxavx512intrin.h
@@ -15,8 +15,7 @@
#if defined(__x86_64__) && defined(__SSE2__)
#define __DEFAULT_FN_ATTRS_AVX512 \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("amx-avx512")))
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-avx512")))
#define __DEFAULT_FN_ATTRS_AVX512BF16 \
__attribute__((__always_inline__, __nodebug__, \
@@ -258,14 +257,14 @@ _tile_cvtrowps2bf16l_internal(unsigned short m, unsigned short n,
}
static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512FP16
-_tile_cvtrowps2phh_internal(unsigned short m, unsigned short n,
- _tile1024i src, unsigned u) {
+_tile_cvtrowps2phh_internal(unsigned short m, unsigned short n, _tile1024i src,
+ unsigned u) {
return __builtin_ia32_tcvtrowps2phh_internal(m, n, src, u);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS_AVX512FP16
-_tile_cvtrowps2phl_internal(unsigned short m, unsigned short n,
- _tile1024i src, unsigned u) {
+_tile_cvtrowps2phl_internal(unsigned short m, unsigned short n, _tile1024i src,
+ unsigned u) {
return __builtin_ia32_tcvtrowps2phl_internal(m, n, src, u);
}
|
#define __DEFAULT_FN_ATTRS_AVX512BF16 \ | ||
__attribute__((__always_inline__, __nodebug__, \ | ||
__target__("amx-avx512,avx512bf16"))) | ||
|
||
#define __DEFAULT_FN_ATTRS_AVX512FP16 \ | ||
__attribute__((__always_inline__, __nodebug__, \ | ||
__target__("amx-avx512,avx512fp16"))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we need avx512bf/fp16
here. We don't rely on their specific instructions.
Yes, we need them for now. Otherwise, we cannot allocate ZMM registers. |
According to AVX10.2 rev. 4:
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965
We set
amx-avx512
as implyingamx-tile
,avx512f
andevex512
whenavx512fp16
andavx512bf16
need to be specified separately.