From e9915df6320aadf73be4717a3396b2f26d6e4c9a Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Tue, 12 Apr 2022 13:54:13 +0300 Subject: [PATCH] [arm64] Add RCPC ISA (8.3+) and use ldap for volatile reads (#67384) Co-authored-by: Adeel Mujahid <3840695+am11@users.noreply.github.com> --- src/coreclr/inc/clrconfigvalues.h | 1 + src/coreclr/inc/corinfoinstructionset.h | 3 +++ src/coreclr/inc/jiteeversionguid.h | 10 +++++----- src/coreclr/jit/codegenarm64.cpp | 6 ++++++ src/coreclr/jit/codegenarmarch.cpp | 10 +++++++--- src/coreclr/jit/emitarm64.cpp | 11 ++++++++++- src/coreclr/jit/instrsarm64.h | 11 +++++++++++ src/coreclr/nativeaot/Runtime/IntrinsicConstants.h | 3 ++- src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp | 4 ++-- src/coreclr/pal/src/misc/jitsupport.cpp | 11 ++++++++--- .../Runtime/ReadyToRunInstructionSetHelper.cs | 1 + .../Common/JitInterface/CorInfoInstructionSet.cs | 3 +++ .../ThunkGenerator/InstructionSetDesc.txt | 1 + .../Compiler/HardwareIntrinsicHelpers.Aot.cs | 3 +++ src/coreclr/tools/aot/ILCompiler/Program.cs | 1 + src/coreclr/vm/codeman.cpp | 5 +++++ 16 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index 4d908aceee543..86beb15b0a637 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -775,6 +775,7 @@ RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Dp, W("EnableArm64Dp"), 1 RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rdm, W("EnableArm64Rdm"), 1, "Allows Arm64 Rdm+ hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Sha1, W("EnableArm64Sha1"), 1, "Allows Arm64 Sha1+ hardware intrinsics to be disabled") RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Sha256, W("EnableArm64Sha256"), 1, "Allows Arm64 Sha256+ hardware intrinsics to be disabled") +RETAIL_CONFIG_DWORD_INFO(EXTERNAL_EnableArm64Rcpc, W("EnableArm64Rcpc"), 1, "Allows Arm64 Rcpc+ hardware intrinsics to be disabled") #endif /// diff --git a/src/coreclr/inc/corinfoinstructionset.h b/src/coreclr/inc/corinfoinstructionset.h index b068b316168f5..e8c79cc143362 100644 --- a/src/coreclr/inc/corinfoinstructionset.h +++ b/src/coreclr/inc/corinfoinstructionset.h @@ -36,6 +36,7 @@ enum CORINFO_InstructionSet InstructionSet_Rdm_Arm64=18, InstructionSet_Sha1_Arm64=19, InstructionSet_Sha256_Arm64=20, + InstructionSet_Rcpc=21, #endif // TARGET_ARM64 #ifdef TARGET_AMD64 InstructionSet_X86Base=1, @@ -486,6 +487,8 @@ inline const char *InstructionSetToString(CORINFO_InstructionSet instructionSet) return "Vector128"; case InstructionSet_Dczva : return "Dczva"; + case InstructionSet_Rcpc : + return "Rcpc"; #endif // TARGET_ARM64 #ifdef TARGET_AMD64 case InstructionSet_X86Base : diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 86335c0a1ddce..b1b0f06a4ece9 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -43,11 +43,11 @@ typedef const GUID *LPCGUID; #define GUID_DEFINED #endif // !GUID_DEFINED -constexpr GUID JITEEVersionIdentifier = { /* b2d3c86f-87fd-4724-9e5d-4c44905eba91 */ - 0xb2d3c86f, - 0x87fd, - 0x4724, - {0x9e, 0x5d, 0x4c, 0x44, 0x90, 0x5e, 0xba, 0x91} +constexpr GUID JITEEVersionIdentifier = { /* 206a7aa6-9f5c-47c1-b63b-54f4cb169ee3 */ + 0x206a7aa6, + 0x9f5c, + 0x47c1, + {0xb6, 0x3b, 0x54, 0xf4, 0xcb, 0x16, 0x9e, 0xe3} }; ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 69fc1885da3e9..d24d0f9db1522 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -5233,6 +5233,12 @@ void CodeGen::genArm64EmitterUnitTests() theEmitter->emitIns_R_R(INS_stlrb, EA_4BYTE, REG_R5, REG_R14); theEmitter->emitIns_R_R(INS_stlrh, EA_4BYTE, REG_R3, REG_R15); + // ldapr Rt, [reg] + theEmitter->emitIns_R_R(INS_ldapr, EA_8BYTE, REG_R9, REG_R8); + theEmitter->emitIns_R_R(INS_ldapr, EA_4BYTE, REG_R7, REG_R10); + theEmitter->emitIns_R_R(INS_ldaprb, EA_4BYTE, REG_R5, REG_R11); + theEmitter->emitIns_R_R(INS_ldaprh, EA_4BYTE, REG_R5, REG_R12); + // ldaxr Rt, [reg] theEmitter->emitIns_R_R(INS_ldaxr, EA_8BYTE, REG_R9, REG_R8); theEmitter->emitIns_R_R(INS_ldaxr, EA_4BYTE, REG_R7, REG_R10); diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp index 460a2b1635d93..b2ab4e84ba87f 100644 --- a/src/coreclr/jit/codegenarmarch.cpp +++ b/src/coreclr/jit/codegenarmarch.cpp @@ -1884,17 +1884,21 @@ void CodeGen::genCodeForIndir(GenTreeIndir* tree) bool addrIsInReg = tree->Addr()->isUsedFromReg(); bool addrIsAligned = ((tree->gtFlags & GTF_IND_UNALIGNED) == 0); + // on arm64-v8.3+ we can use ldap* instructions with acquire/release semantics to avoid + // full memory barriers if mixed with STLR + bool hasRcpc = compiler->compOpportunisticallyDependsOn(InstructionSet_Rcpc); + if ((ins == INS_ldrb) && addrIsInReg) { - ins = INS_ldarb; + ins = hasRcpc ? INS_ldaprb : INS_ldarb; } else if ((ins == INS_ldrh) && addrIsInReg && addrIsAligned) { - ins = INS_ldarh; + ins = hasRcpc ? INS_ldaprh : INS_ldarh; } else if ((ins == INS_ldr) && addrIsInReg && addrIsAligned && genIsValidIntReg(targetReg)) { - ins = INS_ldar; + ins = hasRcpc ? INS_ldapr : INS_ldar; } else #endif // TARGET_ARM64 diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp index 63e8b3bb3ab08..50060a5855ee4 100644 --- a/src/coreclr/jit/emitarm64.cpp +++ b/src/coreclr/jit/emitarm64.cpp @@ -1132,6 +1132,7 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id) { case INS_ldxrb: case INS_ldarb: + case INS_ldaprb: case INS_ldaxrb: case INS_stxrb: case INS_stlrb: @@ -1145,6 +1146,7 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id) case INS_ldxrh: case INS_ldarh: + case INS_ldaprh: case INS_ldaxrh: case INS_stxrh: case INS_stlrh: @@ -1181,6 +1183,7 @@ emitAttr emitter::emitInsTargetRegSize(instrDesc* id) case INS_ldxr: case INS_ldar: + case INS_ldapr: case INS_ldaxr: case INS_stxr: case INS_stlr: @@ -1212,6 +1215,7 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id) switch (ins) { case INS_ldarb: + case INS_ldaprb: case INS_stlrb: case INS_ldrb: case INS_strb: @@ -1223,6 +1227,7 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id) break; case INS_ldarh: + case INS_ldaprh: case INS_stlrh: case INS_ldrh: case INS_strh: @@ -1247,6 +1252,7 @@ emitAttr emitter::emitInsLoadStoreSize(instrDesc* id) break; case INS_ldar: + case INS_ldapr: case INS_stlr: case INS_ldr: case INS_str: @@ -4460,6 +4466,7 @@ void emitter::emitIns_R_R( break; case INS_ldar: + case INS_ldapr: case INS_ldaxr: case INS_ldxr: case INS_stlr: @@ -4468,9 +4475,11 @@ void emitter::emitIns_R_R( FALLTHROUGH; case INS_ldarb: + case INS_ldaprb: case INS_ldaxrb: case INS_ldxrb: case INS_ldarh: + case INS_ldaprh: case INS_ldaxrh: case INS_ldxrh: case INS_stlrb: @@ -14206,7 +14215,7 @@ emitter::insExecutionCharacteristics emitter::getInsExecutionCharacteristics(ins break; case IF_LS_2A: // ldr, ldrsw, ldrb, ldrh, ldrsb, ldrsh, str, strb, strh (no immediate) - // ldar, ldarb, ldarh, ldxr, ldxrb, ldxrh, + // ldar, ldarb, ldarh, ldapr, ldaprb, ldaprh, ldxr, ldxrb, ldxrh, // ldaxr, ldaxrb, ldaxrh, stlr, stlrb, stlrh result.insThroughput = PERFSCORE_THROUGHPUT_1C; diff --git a/src/coreclr/jit/instrsarm64.h b/src/coreclr/jit/instrsarm64.h index 294dcf0b219d8..9ddff3624576e 100644 --- a/src/coreclr/jit/instrsarm64.h +++ b/src/coreclr/jit/instrsarm64.h @@ -1053,6 +1053,17 @@ INST1(ldarb, "ldarb", LD, IF_LS_2A, 0x08DFFC00) INST1(ldarh, "ldarh", LD, IF_LS_2A, 0x48DFFC00) // ldarh Rt,[Xn] LS_2A 0100100011011111 111111nnnnnttttt 48DF FC00 + +INST1(ldapr, "ldapr", LD, IF_LS_2A, 0xB8BFC000) + // ldapr Rt,[Xn] LS_2A 1X11100010111111 110000nnnnnttttt B8BF C000 Rm Rt Rn ARMv8.3 LRCPC + +INST1(ldaprb, "ldaprb", LD, IF_LS_2A, 0x38BFC000) + // ldaprb Rt,[Xn] LS_2A 0011100010111111 110000nnnnnttttt 38BF C000 Rm Rt Rn ARMv8.3 LRCPC + +INST1(ldaprh, "ldaprh", LD, IF_LS_2A, 0x78BFC000) + // ldaprh Rt,[Xn] LS_2A 0111100010111111 110000nnnnnttttt 78BF C000 Rm Rt Rn ARMv8.3 LRCPC + + INST1(ldxr, "ldxr", LD, IF_LS_2A, 0x885F7C00) // ldxr Rt,[Xn] LS_2A 1X00100001011111 011111nnnnnttttt 885F 7C00 diff --git a/src/coreclr/nativeaot/Runtime/IntrinsicConstants.h b/src/coreclr/nativeaot/Runtime/IntrinsicConstants.h index 8ce413315e81c..7908d1a215b53 100644 --- a/src/coreclr/nativeaot/Runtime/IntrinsicConstants.h +++ b/src/coreclr/nativeaot/Runtime/IntrinsicConstants.h @@ -40,7 +40,8 @@ enum ARM64IntrinsicConstants ARM64IntrinsicConstants_Sha256 = 0x0100, ARM64IntrinsicConstants_Atomics = 0x0200, ARM64IntrinsicConstants_Vector64 = 0x0400, - ARM64IntrinsicConstants_Vector128 = 0x0800 + ARM64IntrinsicConstants_Vector128 = 0x0800, + ARM64IntrinsicConstants_Rcpc = 0x1000 }; #endif //HOST_ARM64 diff --git a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp index 410222948e18c..705bf18cf8bc6 100644 --- a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkUnix.cpp @@ -1219,8 +1219,8 @@ REDHAWK_PALEXPORT void REDHAWK_PALAPI PAL_GetCpuCapabilityFlags(int* flags) // *flags |= ARM64IntrinsicConstants_???; #endif #ifdef HWCAP_LRCPC -// if (hwCap & HWCAP_LRCPC) -// *flags |= ARM64IntrinsicConstants_???; + if (hwCap & HWCAP_LRCPC) + *flags |= ARM64IntrinsicConstants_Rcpc; #endif #ifdef HWCAP_PMULL // if (hwCap & HWCAP_PMULL) diff --git a/src/coreclr/pal/src/misc/jitsupport.cpp b/src/coreclr/pal/src/misc/jitsupport.cpp index 6dcc535f2aa14..209b36590837a 100644 --- a/src/coreclr/pal/src/misc/jitsupport.cpp +++ b/src/coreclr/pal/src/misc/jitsupport.cpp @@ -54,7 +54,9 @@ static const CpuCapability CpuCapabilities[] = { #endif //{ "jscvt", HWCAP_JSCVT }, //{ "fcma", HWCAP_FCMA }, - //{ "lrcpc", HWCAP_LRCPC }, +#ifdef HWCAP_LRCPC + { "lrcpc", HWCAP_LRCPC }, +#endif //{ "dcpop", HWCAP_DCPOP }, //{ "sha3", HWCAP_SHA3 }, //{ "sm3", HWCAP_SM3 }, @@ -208,8 +210,8 @@ PAL_GetJitCpuCapabilityFlags(CORJIT_FLAGS *flags) // flags->Set(CORJIT_FLAGS::CORJIT_FLAG_HAS_ARM64_JSCVT); #endif #ifdef HWCAP_LRCPC -// if (hwCap & HWCAP_LRCPC) -// flags->Set(CORJIT_FLAGS::CORJIT_FLAG_HAS_ARM64_LRCPC); + if (hwCap & HWCAP_LRCPC) + flags->Set(InstructionSet_Rcpc); #endif #ifdef HWCAP_PMULL // if (hwCap & HWCAP_PMULL) @@ -280,6 +282,9 @@ PAL_GetJitCpuCapabilityFlags(CORJIT_FLAGS *flags) if ((sysctlbyname("hw.optional.armv8_1_atomics", &valueFromSysctl, &sz, nullptr, 0) == 0) && (valueFromSysctl != 0)) flags->Set(InstructionSet_Atomics); + + if ((sysctlbyname("hw.optional.arm.FEAT_LRCPC", &valueFromSysctl, &sz, nullptr, 0) == 0) && (valueFromSysctl != 0)) + flags->Set(InstructionSet_Rcpc); #endif // HAVE_SYSCTLBYNAME // CoreCLR SIMD and FP support is included in ARM64 baseline // On exceptional basis platforms may leave out support, but CoreCLR does not diff --git a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs index ffc302a827567..595004fabe405 100644 --- a/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs +++ b/src/coreclr/tools/Common/Internal/Runtime/ReadyToRunInstructionSetHelper.cs @@ -43,6 +43,7 @@ public static class ReadyToRunInstructionSetHelper case InstructionSet.ARM64_Vector64: return null; case InstructionSet.ARM64_Vector128: return null; case InstructionSet.ARM64_Dczva: return null; + case InstructionSet.ARM64_Rcpc: return null; default: throw new Exception("Unknown instruction set"); } diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs b/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs index e4ac437796163..d9b6cfb812899 100644 --- a/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs +++ b/src/coreclr/tools/Common/JitInterface/CorInfoInstructionSet.cs @@ -38,6 +38,7 @@ public enum InstructionSet ARM64_Rdm_Arm64 = InstructionSet_ARM64.Rdm_Arm64, ARM64_Sha1_Arm64 = InstructionSet_ARM64.Sha1_Arm64, ARM64_Sha256_Arm64 = InstructionSet_ARM64.Sha256_Arm64, + ARM64_Rcpc = InstructionSet_ARM64.Rcpc, X64_X86Base = InstructionSet_X64.X86Base, X64_SSE = InstructionSet_X64.SSE, X64_SSE2 = InstructionSet_X64.SSE2, @@ -136,6 +137,7 @@ public enum InstructionSet_ARM64 Rdm_Arm64 = 18, Sha1_Arm64 = 19, Sha256_Arm64 = 20, + Rcpc = 21, } public enum InstructionSet_X64 @@ -740,6 +742,7 @@ public static IEnumerable ArchitectureToValidInstructionSets yield return new InstructionSetInfo("Vector64", "", InstructionSet.ARM64_Vector64, false); yield return new InstructionSetInfo("Vector128", "", InstructionSet.ARM64_Vector128, false); yield return new InstructionSetInfo("Dczva", "", InstructionSet.ARM64_Dczva, false); + yield return new InstructionSetInfo("Rcpc", "", InstructionSet.ARM64_Rcpc, false); break; case TargetArchitecture.X64: diff --git a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt index 0d5f36fead942..59ba3ade1c450 100644 --- a/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt +++ b/src/coreclr/tools/Common/JitInterface/ThunkGenerator/InstructionSetDesc.txt @@ -114,6 +114,7 @@ instructionset64bit,ARM64 ,Dp instructionset64bit,ARM64 ,Rdm instructionset64bit,ARM64 ,Sha1 instructionset64bit,ARM64 ,Sha256 +instructionset ,ARM64 , , , ,Rcpc , vectorinstructionset,ARM64,Vector64 vectorinstructionset,ARM64,Vector128 diff --git a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/HardwareIntrinsicHelpers.Aot.cs b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/HardwareIntrinsicHelpers.Aot.cs index 188d44c3f65fb..34a05ce2d93e8 100644 --- a/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/HardwareIntrinsicHelpers.Aot.cs +++ b/src/coreclr/tools/aot/ILCompiler.Compiler/Compiler/HardwareIntrinsicHelpers.Aot.cs @@ -190,6 +190,7 @@ private static class Arm64IntrinsicConstants public const int Atomics = 0x0200; public const int Vector64 = 0x0400; public const int Vector128 = 0x0800; + public const int Rcpc = 0x1000; public static int FromHardwareIntrinsicId(string id) { @@ -207,6 +208,7 @@ public static int FromHardwareIntrinsicId(string id) "Atomics" => Atomics, "Vector64" => Vector64, "Vector128" => Vector128, + "Rcpc" => Rcpc, _ => throw new NotSupportedException(), }; } @@ -231,6 +233,7 @@ public static int FromInstructionSetFlags(InstructionSetFlags instructionSets) InstructionSet.ARM64_Atomics => Atomics, InstructionSet.ARM64_Vector64 => Vector64, InstructionSet.ARM64_Vector128 => Vector128, + InstructionSet.ARM64_Rcpc => Rcpc, _ => throw new NotSupportedException() }; } diff --git a/src/coreclr/tools/aot/ILCompiler/Program.cs b/src/coreclr/tools/aot/ILCompiler/Program.cs index c0e7b493be5ba..d05d7a6c1e906 100644 --- a/src/coreclr/tools/aot/ILCompiler/Program.cs +++ b/src/coreclr/tools/aot/ILCompiler/Program.cs @@ -444,6 +444,7 @@ private int Run(string[] args) optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha1"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("sha2"); optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("lse"); + optimisticInstructionSetSupportBuilder.AddSupportedInstructionSet("rcpc"); } optimisticInstructionSetSupportBuilder.ComputeInstructionSetFlags(out var optimisticInstructionSet, out _, diff --git a/src/coreclr/vm/codeman.cpp b/src/coreclr/vm/codeman.cpp index aece10d072ac2..6f7cf19da2ec9 100644 --- a/src/coreclr/vm/codeman.cpp +++ b/src/coreclr/vm/codeman.cpp @@ -1592,6 +1592,11 @@ void EEJitManager::SetCpuInfo() CPUCompileFlags.Clear(InstructionSet_Atomics); } + if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableArm64Rcpc)) + { + CPUCompileFlags.Clear(InstructionSet_Rcpc); + } + if (!CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_EnableArm64Crc32)) { CPUCompileFlags.Clear(InstructionSet_Crc32);