Ensure that we aren't accidentally generating instructions for unsupported ISAs #64140

Merged: 5 commits, Jan 24, 2022
61 changes: 42 additions & 19 deletions src/coreclr/jit/gentree.cpp
@@ -19294,20 +19294,22 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op,

NamedIntrinsic intrinsic = NI_Illegal;

#if defined(TARGET_XARCH)
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
}
#endif // TARGET_XARCH

switch (op)
{
#if defined(TARGET_XARCH)
case GT_EQ:
{
intrinsic = (simdSize == 32) ? NI_Vector256_op_Equality : NI_Vector128_op_Equality;
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));

intrinsic = NI_Vector256_op_Equality;
}
else
{
intrinsic = NI_Vector128_op_Equality;
}
break;
}

@@ -19323,6 +19325,12 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op,

if (simdSize == 32)
{
// TODO-XArch-CQ: It's a non-trivial amount of work to support these
// for floating-point while only utilizing AVX. It would require, among
// other things, inverting the comparison and potentially support for a
// new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));

Comment on lines +19328 to +19333

@tannergooding (Member, Author) commented on Jan 22, 2022:

Just noting that this is worth doing even if we decided to only accelerate on AVX2+. Using CompareNotGreaterThan (and other "inverse" comparisons) allows us to use PTEST, which in turn is more efficient and allows the code to directly branch rather than requiring a MoveMask instruction.

As indicated in hwintrinsicxarch.cpp, I opted not to do this for the bug fix as I expect we'll need to backport this to .NET 7 P1.
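
A minimal standalone sketch (plain AVX intrinsics in C++, not the JIT's actual codegen) of why the inverse comparison helps: the comparison mask can be fed to vtestps/vptest, which sets the flags for the branch directly, whereas the straightforward form needs vmovmskps plus a scalar compare first.

```cpp
// Sketch only: contrasts the MoveMask pattern with the flags-based TEST pattern
// for an "all elements greater than" check over a 256-bit float vector.
#include <immintrin.h>

// MoveMask form: vcmpps + vmovmskps + cmp + branch
static bool AllGreaterThan_MoveMask(__m256 x, __m256 y)
{
    __m256 gt = _mm256_cmp_ps(x, y, _CMP_GT_OQ);
    return _mm256_movemask_ps(gt) == 0xFF;
}

// Inverse-comparison form: vcmpps + vtestps + branch.
// If no lane satisfies "not greater than", then every lane is greater than.
// (vtestps looks only at sign bits, which is sufficient for a comparison mask
// whose lanes are all-ones or all-zeros.)
static bool AllGreaterThan_Test(__m256 x, __m256 y)
{
    __m256 ngt = _mm256_cmp_ps(x, y, _CMP_NGT_UQ);
    return _mm256_testz_ps(ngt, ngt) != 0;
}
```

The hypothetical Avx.TestNotZ mentioned in the TODO would give the same flags-based branch shape to the "any"/inequality paths.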

intrinsic = NI_Vector256_op_Equality;
getAllBitsSet = NI_Vector256_get_AllBitsSet;
}
@@ -19433,14 +19441,6 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op,

NamedIntrinsic intrinsic = NI_Illegal;

#if defined(TARGET_XARCH)
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));
}
#endif // TARGET_XARCH

switch (op)
{
#if defined(TARGET_XARCH)
@@ -19453,7 +19453,20 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op,
// We want to generate a comparison along the lines of
// GT_XX(op1, op2).As<T, TInteger>() != Vector128<TInteger>.Zero

intrinsic = (simdSize == 32) ? NI_Vector256_op_Inequality : NI_Vector128_op_Inequality;
if (simdSize == 32)
{
// TODO-XArch-CQ: It's a non-trivial amount of work to support these
// for floating-point while only utilizing AVX. It would require, among
// other things, inverting the comparison and potentially support for a
// new Avx.TestNotZ intrinsic to ensure the codegen remains efficient.
assert(compIsaSupportedDebugOnly(InstructionSet_AVX2));

intrinsic = NI_Vector256_op_Inequality;
}
else
{
intrinsic = NI_Vector128_op_Inequality;
}

op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize,
/* isSimdAsHWIntrinsic */ false);
@@ -19475,7 +19488,17 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op,

case GT_NE:
{
intrinsic = (simdSize == 32) ? NI_Vector256_op_Inequality : NI_Vector128_op_Inequality;
if (simdSize == 32)
{
assert(compIsaSupportedDebugOnly(InstructionSet_AVX));
assert(varTypeIsFloating(simdBaseType) || compIsaSupportedDebugOnly(InstructionSet_AVX2));

intrinsic = NI_Vector256_op_Inequality;
}
else
{
intrinsic = NI_Vector128_op_Inequality;
}
break;
}
#elif defined(TARGET_ARM64)
3 changes: 3 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenarm64.cpp
@@ -206,6 +206,9 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
{
const HWIntrinsic intrin(node);

// We need to validate that other phases of the compiler haven't introduced unsupported intrinsics
assert(compiler->compIsaSupportedDebugOnly(HWIntrinsicInfo::lookupIsa(intrin.id)));

regNumber targetReg = node->GetRegNum();

regNumber op1Reg = REG_NA;
3 changes: 3 additions & 0 deletions src/coreclr/jit/hwintrinsiccodegenxarch.cpp
@@ -86,6 +86,9 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(intrinsicId);
size_t numArgs = node->GetOperandCount();

// We need to validate that other phases of the compiler haven't introduced unsupported intrinsics
assert(compiler->compIsaSupportedDebugOnly(isa));
@tannergooding (Member, Author) commented:

This shows the assert triggering on the original bug: https://dev.azure.com/dnceng/public/_build/results?buildId=1566647&view=ms.vss-test-web.build-test-results-tab&runId=44059054&resultId=102987&paneView=debug

It's not "perfect" and its still possible for something else in the JIT to incorrectly generate an AVX2 instruction; but this should catch the bulk of them, particularly since most intrinsic usages are now driven through the HWIntrinsic nodes so that optimizations and other transforms are consistently applied.


int ival = HWIntrinsicInfo::lookupIval(intrinsicId, compiler->compOpportunisticallyDependsOn(InstructionSet_AVX));

assert(HWIntrinsicInfo::RequiresCodegen(intrinsicId));
25 changes: 15 additions & 10 deletions src/coreclr/jit/hwintrinsicxarch.cpp
@@ -1024,7 +1024,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
@tannergooding (Member, Author) commented on Jan 22, 2022:
I was originally going to handle this differently, but this likely needs to be backported to .NET 7 P1, so I kept it "minimal" and added the TODO-XArch-CQ to track the other scenario.

{
var_types simdType = getSIMDTypeForSize(simdSize);

@@ -1287,7 +1287,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
A Contributor commented:
I wonder if it's worth having such ISA requirements done in a declarative way and checking that before the switch? (Obviously, as a follow-up exercise).
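
A purely hypothetical sketch of what that declarative shape could look like (the enum and table names below are invented for illustration and are not existing JIT types): record the ISA requirement per intrinsic once and consult it before the switch.

```cpp
// Hypothetical illustration only: a table-driven ISA requirement check.
#include <cstdint>

enum class Isa : uint8_t { SSE2, SSE41, AVX, AVX2 };
enum class IntrinsicId : uint16_t { Vector256_GreaterThanAll, Vector256_LessThanAll /* ... */ };

struct IsaRequirement
{
    IntrinsicId id;
    unsigned    simdSize;    // the requirement applies only at this vector size
    Isa         requiredIsa;
};

static const IsaRequirement s_isaRequirements[] = {
    { IntrinsicId::Vector256_GreaterThanAll, 32, Isa::AVX2 },
    { IntrinsicId::Vector256_LessThanAll,    32, Isa::AVX2 },
};

// Consulted once before the big switch instead of repeating the check in every case.
template <typename IsaSupportedFn>
bool IsaRequirementMet(IntrinsicId id, unsigned simdSize, IsaSupportedFn isaSupported)
{
    for (const IsaRequirement& req : s_isaRequirements)
    {
        if ((req.id == id) && (req.simdSize == simdSize))
        {
            return isaSupported(req.requiredIsa);
        }
    }
    return true; // no entry: nothing beyond the baseline is required
}
```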

@tannergooding (Member, Author) replied:

I've considered the same. Part of the issue is the number of variations/considerations that might come into play here and things being missed in refactorings or changes.

It'd be great if we had a way to easily unit test these and validate when something is treated as intrinsic vs not. There was a functioning (but hacky) solution done for Vector that was never done for the hardware intrinsics; but we're also already hitting various timing issues.

It might work better if many of the HWIntrinsic tests were ported to be library tests instead and we limited the runtime side to super simple validation of things like "was it jitted or treated as an intrinsic"...

A Member commented:

> It might work better if many of the HWIntrinsic tests were ported to be library tests

What are the library tests doing differently that would make it work better?

@tannergooding (Member, Author) replied:

> What are the library tests doing differently that would make it work better?

For starters, they don't run under GCStress today. But on top of that, the managed testing frameworks already handle things like parallelizing the test runs; tracking each scenario individually when considering timeouts, etc.

xUnit (as do others like MSTest and NUnit) already handles a large number of these scenarios/considerations and so we'd be able to still provide the desired coverage while limiting what gets stress tested for the runtime to just the scenarios we believe are "most impactful".

@tannergooding (Member, Author) added:

So, I imagine moving most of the logic to the libraries side may actually speed up the inner loop test time, make it easier to debug/validate certain changes, and fix the GCStress timeouts.

We'd still have to keep some scenarios to ensure that we do get stress test coverage; but we may be able to limit that to a single scenario per intrinsic rather than 10-12 scenarios per intrinsic.

@jkotas (Member) commented on Jan 24, 2022:

> For starters, they don't run under GCStress today

That is a bug, not a feature. I would love to have the capability to run GCStress on anything. One of the problems with GCStress and libraries tests is the reflection-based xunit runner, which is too heavy for GCStress.

> managed testing frameworks already handle things like parallelizing the test runs; tracking each scenario individually when considering timeouts, etc.

We are moving the runtime tests to the same xunit authoring as libraries tests (except that the runner is going to be source generated). I hope we are going to have similar flexibility in the runtime tests once that is all done.

A Member commented:

> That is a bug, not a feature. I would love to have the capability to run GCStress on anything. One of the problems with GCStress and libraries tests is the reflection-based xunit runner, which is too heavy for GCStress.

fwiw, I created (a long time ago) GCStress pipelines for libraries tests: https://dev.azure.com/dnceng/public/_build?definitionId=839, https://dev.azure.com/dnceng/public/_build?definitionId=840, but never enabled them because there are too many timeouts and other GCStress-related issues that need to be cleaned up before they would stop being too noisy.

{
var_types simdType = getSIMDTypeForSize(simdSize);

Expand All @@ -1305,7 +1305,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
{
var_types simdType = getSIMDTypeForSize(simdSize);

@@ -1339,7 +1339,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
{
var_types simdType = getSIMDTypeForSize(simdSize);

@@ -1357,7 +1357,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
{
var_types simdType = getSIMDTypeForSize(simdSize);

@@ -1391,7 +1391,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
{
var_types simdType = getSIMDTypeForSize(simdSize);

@@ -1409,7 +1409,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
{
var_types simdType = getSIMDTypeForSize(simdSize);

@@ -1443,7 +1443,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
{
var_types simdType = getSIMDTypeForSize(simdSize);

@@ -1461,7 +1461,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
{
assert(sig->numArgs == 2);

if ((simdSize != 32) || varTypeIsFloating(simdBaseType) || compExactlyDependsOn(InstructionSet_AVX2))
if ((simdSize != 32) || compExactlyDependsOn(InstructionSet_AVX2))
{
var_types simdType = getSIMDTypeForSize(simdSize);

@@ -2024,7 +2024,12 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic,
assert(sig->numArgs == 1);
var_types simdType = getSIMDTypeForSize(simdSize);

if (varTypeIsFloating(simdBaseType))
if ((simdSize == 32) && !compOpportunisticallyDependsOn(InstructionSet_AVX2))
{
// Vector256 for integer types requires AVX2
break;
}
else if (varTypeIsFloating(simdBaseType))
{
if (!compOpportunisticallyDependsOn(InstructionSet_SSE3))
{
2 changes: 1 addition & 1 deletion src/coreclr/jit/lowerxarch.cpp
@@ -3429,7 +3429,7 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node)
// e6, e7, e4, e5 | e2, e3, e0, e1
// e7, e6, e5, e4 | e3, e2, e1, e0

shuffleConst = 0x4D;
shuffleConst = 0x4E;
tannergooding marked this conversation as resolved.
@tannergooding (Member, Author) commented on Jan 22, 2022:

I ran tests on all ISAs from SSE2 through AVX2 to ensure no other "disabled" instructions were being generated, and it flagged this as a failing test (0x4D meant it was selecting 1, 3, 0, 1, while 0x4E picks 2, 3, 0, 1).
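
For reference, each 2-bit field of the shuffle immediate selects the source element for the corresponding destination lane, so the two constants decode as follows (illustrative decoder, not JIT code):

```cpp
// Decodes a PSHUFD/SHUFPS-style 8-bit shuffle immediate:
// destination lane i takes the source element named by bits [2*i+1 : 2*i].
#include <cstdio>

static void DecodeShuffleImm(unsigned imm)
{
    printf("0x%02X selects %u, %u, %u, %u\n", imm,
           imm & 0x3, (imm >> 2) & 0x3, (imm >> 4) & 0x3, (imm >> 6) & 0x3);
}

int main()
{
    DecodeShuffleImm(0x4D); // 1, 3, 0, 1 -- element 2 is never selected (the bug)
    DecodeShuffleImm(0x4E); // 2, 3, 0, 1 -- swaps the adjacent element pairs, as intended
    return 0;
}
```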

break;
}

Expand Down