Skip to content

Commit

Permalink
JIT: Added SVE Prefetch* APIs. (#103094)
Browse files Browse the repository at this point in the history
* Initial APIs

* Added SvePrefetch test template. Prefetch APIs now have special codegen.

* Minor cleanup

* Feedback

* Some cleanup

* More work

* Tests pass

* Revert changes

* put back

* Fix merge

* Fixing enum type

* Quick test fix

* Fix api

* Feedback

* Feedback

* Feedback - testing prefetch types

* load fix
  • Loading branch information
TIHan committed Jun 12, 2024
1 parent 7d23d61 commit be2827c
Show file tree
Hide file tree
Showing 12 changed files with 442 additions and 18 deletions.
4 changes: 4 additions & 0 deletions src/coreclr/jit/gentree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26916,6 +26916,10 @@ bool GenTreeHWIntrinsic::OperIsMemoryLoad(GenTree** pAddr) const
case NI_Sve_Load2xVectorAndUnzip:
case NI_Sve_Load3xVectorAndUnzip:
case NI_Sve_Load4xVectorAndUnzip:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
addr = Op(2);
break;
#endif // TARGET_ARM64
Expand Down
8 changes: 8 additions & 0 deletions src/coreclr/jit/hwintrinsicarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -448,6 +448,14 @@ void HWIntrinsicInfo::lookupImmBounds(
}
break;

case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
immLowerBound = (int)SVE_PRFOP_PLDL1KEEP;
immUpperBound = (int)SVE_PRFOP_CONST15;
break;

default:
unreached();
}
Expand Down
27 changes: 27 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1489,6 +1489,33 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
break;
}

case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
{
assert(hasImmediateOperand);
assert(HWIntrinsicInfo::HasEnumOperand(intrin.id));
if (intrin.op3->IsCnsIntOrI())
{
GetEmitter()->emitIns_PRFOP_R_R_I(ins, emitSize,
(insSvePrfop)intrin.op3->AsIntConCommon()->IconValue(), op1Reg,
op2Reg, 0);
}
else
{
assert(!intrin.op3->isContainedIntOrIImmed());

HWIntrinsicImmOpHelper helper(this, intrin.op3, node);
for (helper.EmitBegin(); !helper.Done(); helper.EmitCaseEnd())
{
const insSvePrfop prfop = (insSvePrfop)helper.ImmValue();
GetEmitter()->emitIns_PRFOP_R_R_I(ins, emitSize, prfop, op1Reg, op2Reg, 0);
}
}
break;
}

case NI_Vector64_ToVector128:
GetEmitter()->emitIns_Mov(ins, emitSize, targetReg, op1Reg, /* canSkip */ false);
break;
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/hwintrinsiclistarm64sve.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ HARDWARE_INTRINSIC(Sve, Negate,
HARDWARE_INTRINSIC(Sve, Or, -1, -1, false, {INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_sve_orr, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_OptionalEmbeddedMaskedOperation|HW_Flag_HasRMWSemantics|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, OrAcross, -1, -1, false, {INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_sve_orv, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, PopCount, -1, -1, false, {INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt, INS_sve_cnt}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_BaseTypeFromFirstArg|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, PrefetchBytes, -1, 3, false, {INS_invalid, INS_sve_prfb, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand)
HARDWARE_INTRINSIC(Sve, PrefetchInt16, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_sve_prfh, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand)
HARDWARE_INTRINSIC(Sve, PrefetchInt32, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_prfw, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand)
HARDWARE_INTRINSIC(Sve, PrefetchInt64, -1, 3, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_prfd, INS_invalid, INS_invalid}, HW_Category_MemoryLoad, HW_Flag_Scalable|HW_Flag_SpecialCodeGen|HW_Flag_ExplicitMaskedOperation|HW_Flag_LowMaskedOperation|HW_Flag_BaseTypeFromFirstArg|HW_Flag_HasImmediateOperand|HW_Flag_HasEnumOperand)
HARDWARE_INTRINSIC(Sve, ReverseElement, -1, 1, true, {INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev, INS_sve_rev}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_SpecialCodeGen)
HARDWARE_INTRINSIC(Sve, ReverseElement16, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_revh, INS_sve_revh, INS_sve_revh, INS_sve_revh, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
HARDWARE_INTRINSIC(Sve, ReverseElement32, -1, -1, false, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_sve_revw, INS_sve_revw, INS_invalid, INS_invalid}, HW_Category_SIMD, HW_Flag_Scalable|HW_Flag_EmbeddedMaskedOperation|HW_Flag_LowMaskedOperation)
Expand Down
4 changes: 4 additions & 0 deletions src/coreclr/jit/lowerarmarch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3194,6 +3194,10 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x2:
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x3:
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x4:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
assert(hasImmediateOperand);
assert(varTypeIsIntegral(intrin.op3));
if (intrin.op3->IsCnsIntOrI())
Expand Down
52 changes: 34 additions & 18 deletions src/coreclr/jit/lsraarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1447,6 +1447,10 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x2:
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x3:
case NI_AdvSimd_Arm64_StoreSelectedScalarVector128x4:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
needBranchTargetReg = !intrin.op3->isContainedIntOrIImmed();
break;

Expand Down Expand Up @@ -1966,28 +1970,40 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree, int* pDstCou
(argNum == lowVectorOperandNum) ? lowVectorCandidates : RBM_NONE);
}
}
else if (intrin.id == NI_Sve_StoreAndZip)
{
srcCount += BuildAddrUses(intrin.op2);
}
else
{
SingleTypeRegSet candidates = lowVectorOperandNum == 2 ? lowVectorCandidates : RBM_NONE;
switch (intrin.id)
{
case NI_Sve_StoreAndZip:
case NI_Sve_PrefetchBytes:
case NI_Sve_PrefetchInt16:
case NI_Sve_PrefetchInt32:
case NI_Sve_PrefetchInt64:
assert(intrinsicTree->OperIsMemoryLoadOrStore());
srcCount += BuildAddrUses(intrin.op2);
break;

if (intrin.op2->gtType == TYP_MASK)
{
assert(lowVectorOperandNum != 2);
candidates = RBM_ALLMASK.GetPredicateRegSet();
}
default:
{
SingleTypeRegSet candidates = lowVectorOperandNum == 2 ? lowVectorCandidates : RBM_NONE;

if (forceOp2DelayFree)
{
srcCount += BuildDelayFreeUses(intrin.op2, nullptr, candidates);
}
else
{
srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates)
: BuildOperandUses(intrin.op2, candidates);
if (intrin.op2->gtType == TYP_MASK)
{
assert(lowVectorOperandNum != 2);
candidates = RBM_ALLMASK.GetPredicateRegSet();
}

if (forceOp2DelayFree)
{
srcCount += BuildDelayFreeUses(intrin.op2, nullptr, candidates);
}
else
{
srcCount += isRMW ? BuildDelayFreeUses(intrin.op2, intrin.op1, candidates)
: BuildOperandUses(intrin.op2, candidates);
}
}
break;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,4 +92,67 @@ public enum SveMaskPattern : byte
/// </summary>
All = 31 // All available (implicitly a multiple of two).
}

/// <summary>
/// Selects the prefetch operation passed as the <c>prefetchType</c> argument of the
/// <c>Sve.Prefetch*</c> methods (the <c>enum svprfop op</c> operand of the svprfb/svprfh/svprfw/svprfd forms).
/// Each member's summary names the corresponding prefetch-operation mnemonic.
/// </summary>
/// <remarks>
/// The numeric values are not contiguous: 6, 7, 14 and 15 have no named member.
/// </remarks>
public enum SvePrefetchType : byte
{
/// <summary>
/// PLDL1KEEP: prefetch for load, level 1, temporal (keep).
/// </summary>
LoadL1Temporal = 0,

/// <summary>
/// PLDL1STRM: prefetch for load, level 1, non-temporal (streaming).
/// </summary>
LoadL1NonTemporal = 1,

/// <summary>
/// PLDL2KEEP: prefetch for load, level 2, temporal (keep).
/// </summary>
LoadL2Temporal = 2,

/// <summary>
/// PLDL2STRM: prefetch for load, level 2, non-temporal (streaming).
/// </summary>
LoadL2NonTemporal = 3,

/// <summary>
/// PLDL3KEEP: prefetch for load, level 3, temporal (keep).
/// </summary>
LoadL3Temporal = 4,

/// <summary>
/// PLDL3STRM: prefetch for load, level 3, non-temporal (streaming).
/// </summary>
LoadL3NonTemporal = 5,

// Values 6 and 7 are intentionally left without a named member.

/// <summary>
/// PSTL1KEEP: prefetch for store, level 1, temporal (keep).
/// </summary>
StoreL1Temporal = 8,

/// <summary>
/// PSTL1STRM: prefetch for store, level 1, non-temporal (streaming).
/// </summary>
StoreL1NonTemporal = 9,

/// <summary>
/// PSTL2KEEP: prefetch for store, level 2, temporal (keep).
/// </summary>
StoreL2Temporal = 10,

/// <summary>
/// PSTL2STRM: prefetch for store, level 2, non-temporal (streaming).
/// </summary>
StoreL2NonTemporal = 11,

/// <summary>
/// PSTL3KEEP: prefetch for store, level 3, temporal (keep).
/// </summary>
StoreL3Temporal = 12,

/// <summary>
/// PSTL3STRM: prefetch for store, level 3, non-temporal (streaming).
/// </summary>
StoreL3NonTemporal = 13
};
}
Original file line number Diff line number Diff line change
Expand Up @@ -3376,6 +3376,30 @@ internal Arm64() { }
/// </summary>
public static unsafe Vector<ulong> PopCount(Vector<ulong> value) { throw new PlatformNotSupportedException(); }

/// <summary>
/// void svprfb(svbool_t pg, const void *base, enum svprfop op)
/// PRFB op, Pg, [Xbase, #0, MUL VL]
/// </summary>
/// <param name="mask">The governing predicate (<c>pg</c> / <c>Pg</c> in the forms above).</param>
/// <param name="address">The base address (<c>base</c> / <c>Xbase</c> in the forms above).</param>
/// <param name="prefetchType">The prefetch operation (<c>op</c> in the forms above); expected to be a constant.</param>
/// <exception cref="PlatformNotSupportedException">Always thrown by this implementation.</exception>
public static unsafe void PrefetchBytes(Vector<byte> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw new PlatformNotSupportedException(); }

/// <summary>
/// void svprfh(svbool_t pg, const void *base, enum svprfop op)
/// PRFH op, Pg, [Xbase, #0, MUL VL]
/// </summary>
/// <param name="mask">The governing predicate (<c>pg</c> / <c>Pg</c> in the forms above).</param>
/// <param name="address">The base address (<c>base</c> / <c>Xbase</c> in the forms above).</param>
/// <param name="prefetchType">The prefetch operation (<c>op</c> in the forms above); expected to be a constant.</param>
/// <exception cref="PlatformNotSupportedException">Always thrown by this implementation.</exception>
public static unsafe void PrefetchInt16(Vector<ushort> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw new PlatformNotSupportedException(); }

/// <summary>
/// void svprfw(svbool_t pg, const void *base, enum svprfop op)
/// PRFW op, Pg, [Xbase, #0, MUL VL]
/// </summary>
/// <param name="mask">The governing predicate (<c>pg</c> / <c>Pg</c> in the forms above).</param>
/// <param name="address">The base address (<c>base</c> / <c>Xbase</c> in the forms above).</param>
/// <param name="prefetchType">The prefetch operation (<c>op</c> in the forms above); expected to be a constant.</param>
/// <exception cref="PlatformNotSupportedException">Always thrown by this implementation.</exception>
public static unsafe void PrefetchInt32(Vector<uint> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw new PlatformNotSupportedException(); }

/// <summary>
/// void svprfd(svbool_t pg, const void *base, enum svprfop op)
/// PRFD op, Pg, [Xbase, #0, MUL VL]
/// </summary>
/// <param name="mask">The governing predicate (<c>pg</c> / <c>Pg</c> in the forms above).</param>
/// <param name="address">The base address (<c>base</c> / <c>Xbase</c> in the forms above).</param>
/// <param name="prefetchType">The prefetch operation (<c>op</c> in the forms above); expected to be a constant.</param>
/// <exception cref="PlatformNotSupportedException">Always thrown by this implementation.</exception>
public static unsafe void PrefetchInt64(Vector<ulong> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw new PlatformNotSupportedException(); }


/// Reverse all elements

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3432,6 +3432,29 @@ internal Arm64() { }
/// </summary>
public static unsafe Vector<ulong> PopCount(Vector<ulong> value) => PopCount(value);

// NOTE(review): the body is a call to this same method with the same arguments; presumably the JIT
// recognizes it as the NI_Sve_PrefetchBytes hardware intrinsic and emits PRFB instead — confirm.
/// <summary>
/// void svprfb(svbool_t pg, const void *base, enum svprfop op)
/// PRFB op, Pg, [Xbase, #0, MUL VL]
/// </summary>
/// <param name="mask">The governing predicate (<c>pg</c> / <c>Pg</c> in the forms above).</param>
/// <param name="address">The base address (<c>base</c> / <c>Xbase</c> in the forms above).</param>
/// <param name="prefetchType">The prefetch operation (<c>op</c> in the forms above); expected to be a constant.</param>
public static unsafe void PrefetchBytes(Vector<byte> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) => PrefetchBytes(mask, address, prefetchType);

// NOTE(review): the body is a call to this same method with the same arguments; presumably the JIT
// recognizes it as the NI_Sve_PrefetchInt16 hardware intrinsic and emits PRFH instead — confirm.
/// <summary>
/// void svprfh(svbool_t pg, const void *base, enum svprfop op)
/// PRFH op, Pg, [Xbase, #0, MUL VL]
/// </summary>
/// <param name="mask">The governing predicate (<c>pg</c> / <c>Pg</c> in the forms above).</param>
/// <param name="address">The base address (<c>base</c> / <c>Xbase</c> in the forms above).</param>
/// <param name="prefetchType">The prefetch operation (<c>op</c> in the forms above); expected to be a constant.</param>
public static unsafe void PrefetchInt16(Vector<ushort> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) => PrefetchInt16(mask, address, prefetchType);

// NOTE(review): the body is a call to this same method with the same arguments; presumably the JIT
// recognizes it as the NI_Sve_PrefetchInt32 hardware intrinsic and emits PRFW instead — confirm.
/// <summary>
/// void svprfw(svbool_t pg, const void *base, enum svprfop op)
/// PRFW op, Pg, [Xbase, #0, MUL VL]
/// </summary>
/// <param name="mask">The governing predicate (<c>pg</c> / <c>Pg</c> in the forms above).</param>
/// <param name="address">The base address (<c>base</c> / <c>Xbase</c> in the forms above).</param>
/// <param name="prefetchType">The prefetch operation (<c>op</c> in the forms above); expected to be a constant.</param>
public static unsafe void PrefetchInt32(Vector<uint> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) => PrefetchInt32(mask, address, prefetchType);

// NOTE(review): the body is a call to this same method with the same arguments; presumably the JIT
// recognizes it as the NI_Sve_PrefetchInt64 hardware intrinsic and emits PRFD instead — confirm.
/// <summary>
/// void svprfd(svbool_t pg, const void *base, enum svprfop op)
/// PRFD op, Pg, [Xbase, #0, MUL VL]
/// </summary>
/// <param name="mask">The governing predicate (<c>pg</c> / <c>Pg</c> in the forms above).</param>
/// <param name="address">The base address (<c>base</c> / <c>Xbase</c> in the forms above).</param>
/// <param name="prefetchType">The prefetch operation (<c>op</c> in the forms above); expected to be a constant.</param>
public static unsafe void PrefetchInt64(Vector<ulong> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) => PrefetchInt64(mask, address, prefetchType);

/// Reverse all elements

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4649,6 +4649,11 @@ internal Arm64() { }
public static System.Numerics.Vector<ulong> PopCount(System.Numerics.Vector<long> value) { throw null; }
public static System.Numerics.Vector<ulong> PopCount(System.Numerics.Vector<ulong> value) { throw null; }

public static unsafe void PrefetchBytes(System.Numerics.Vector<byte> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw null; }
public static unsafe void PrefetchInt16(System.Numerics.Vector<ushort> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw null; }
public static unsafe void PrefetchInt32(System.Numerics.Vector<uint> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw null; }
public static unsafe void PrefetchInt64(System.Numerics.Vector<ulong> mask, void* address, [ConstantExpected] SvePrefetchType prefetchType) { throw null; }

public static System.Numerics.Vector<byte> ReverseElement(System.Numerics.Vector<byte> value) { throw null; }
public static System.Numerics.Vector<double> ReverseElement(System.Numerics.Vector<double> value) { throw null; }
public static System.Numerics.Vector<short> ReverseElement(System.Numerics.Vector<short> value) { throw null; }
Expand Down Expand Up @@ -4981,6 +4986,22 @@ public enum SveMaskPattern : byte
LargestMultipleOf3 = 30, // The largest multiple of 3.
All = 31 // All available (implicitly a multiple of two).
};

public enum SvePrefetchType : byte
{
LoadL1Temporal = 0, // PLDL1KEEP
LoadL1NonTemporal = 1, // PLDL1STRM
LoadL2Temporal = 2, // PLDL2KEEP
LoadL2NonTemporal = 3, // PLDL2STRM
LoadL3Temporal = 4, // PLDL3KEEP
LoadL3NonTemporal = 5, // PLDL3STRM
// Values 6 and 7 have no named member.
StoreL1Temporal = 8, // PSTL1KEEP
StoreL1NonTemporal = 9, // PSTL1STRM
StoreL2Temporal = 10, // PSTL2KEEP
StoreL2NonTemporal = 11, // PSTL2STRM
StoreL3Temporal = 12, // PSTL3KEEP
StoreL3NonTemporal = 13 // PSTL3STRM
};
}
namespace System.Runtime.Intrinsics.X86
{
Expand Down
Loading

0 comments on commit be2827c

Please sign in to comment.