From d6f4d5209ffcdc6d8e33a14fe8df70283b768f45 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407
Date: Tue, 28 Nov 2023 07:05:51 -0500
Subject: [PATCH] [CGP][AArch64] Rebase the common base offset for better ISel

When all the large constant offsets share the same value in bits 12 to 23,
rebase them onto that common high part so it is materialized by a single add
and the low parts fit into the load/store immediate fields.

Fold
  add     x8, x0, #2031, lsl #12
  add     x8, x8, #960
  ldr     x9, [x8]
  ldr     x8, [x8, #2056]
into
  add     x8, x0, #2031, lsl #12
  ldr     x9, [x8, #960]
  ldr     x8, [x8, #3016]
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      |  4 +
 llvm/include/llvm/CodeGen/TargetLowering.h    |  8 +-
 llvm/lib/CodeGen/CodeGenPrepare.cpp           | 79 ++++++++++++-------
 .../Target/AArch64/AArch64ISelLowering.cpp    | 14 ++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |  3 +
 llvm/test/CodeGen/AArch64/arm64-addrmode.ll   |  5 +-
 .../AArch64/large-offset-gep.ll               | 57 ++++++-------
 7 files changed, 105 insertions(+), 65 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index fd3410586e172a..e05ce2890a08c8 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -342,6 +342,10 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
   }

+  int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
+    return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
+  }
+
   unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
                              Type *ScalarValTy) const {
     auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 77ee6b89ed8a34..3cd89c71f71640 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -30,8 +30,8 @@
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/LowLevelTypeUtils.h"
-#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
@@ -2721,6 +2721,12 @@ class TargetLoweringBase {
                                      Type *Ty, unsigned AddrSpace,
                                      Instruction *I = nullptr) const;

+  /// Return the preferred common base offset.
+  virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
+                                                 int64_t MaxOffset) const {
+    return 0;
+  }
+
   /// Return true if the specified immediate is legal icmp immediate, that is
   /// the target has icmp instructions which can compare a register against the
   /// immediate without having to materialize the immediate into a register.
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 885d2d3ce24825..824371c9b9f91f 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -6121,6 +6121,55 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
     int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
     Value *NewBaseGEP = nullptr;

+    auto createNewBase = [&](int64_t BaseOffset, Value *OldBase,
+                             GetElementPtrInst *GEP) {
+      LLVMContext &Ctx = GEP->getContext();
+      Type *PtrIdxTy = DL->getIndexType(GEP->getType());
+      Type *I8PtrTy =
+          PointerType::get(Ctx, GEP->getType()->getPointerAddressSpace());
+      Type *I8Ty = Type::getInt8Ty(Ctx);
+
+      BasicBlock::iterator NewBaseInsertPt;
+      BasicBlock *NewBaseInsertBB;
+      if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
+        // If the base of the struct is an instruction, the new base will be
+        // inserted close to it.
+        NewBaseInsertBB = BaseI->getParent();
+        if (isa<PHINode>(BaseI))
+          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+        else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
+          NewBaseInsertBB =
+              SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
+          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+        } else
+          NewBaseInsertPt = std::next(BaseI->getIterator());
+      } else {
+        // If the current base is an argument or global value, the new base
+        // will be inserted to the entry block.
+        NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
+        NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
+      }
+      IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
+      // Create a new base.
+      Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
+      NewBaseGEP = OldBase;
+      if (NewBaseGEP->getType() != I8PtrTy)
+        NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
+      NewBaseGEP =
+          NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
+      NewGEPBases.insert(NewBaseGEP);
+      return;
+    };
+
+    // Check whether all the offsets can be encoded with the preferred common base.
+    if (int64_t PreferBase = TLI->getPreferredLargeGEPBaseOffset(
+            LargeOffsetGEPs.front().second, LargeOffsetGEPs.back().second)) {
+      BaseOffset = PreferBase;
+      // Create a new base if the offset of the BaseGEP can be encoded with one
+      // instruction.
+      createNewBase(BaseOffset, OldBase, BaseGEP);
+    }
+
     auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
     while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
       GetElementPtrInst *GEP = LargeOffsetGEP->first;
@@ -6153,35 +6202,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
       if (!NewBaseGEP) {
         // Create a new base if we don't have one yet. Find the insertion
         // pointer for the new base first.
-        BasicBlock::iterator NewBaseInsertPt;
-        BasicBlock *NewBaseInsertBB;
-        if (auto *BaseI = dyn_cast<Instruction>(OldBase)) {
-          // If the base of the struct is an instruction, the new base will be
-          // inserted close to it.
-          NewBaseInsertBB = BaseI->getParent();
-          if (isa<PHINode>(BaseI))
-            NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-          else if (InvokeInst *Invoke = dyn_cast<InvokeInst>(BaseI)) {
-            NewBaseInsertBB =
-                SplitEdge(NewBaseInsertBB, Invoke->getNormalDest(), DT.get(), LI);
-            NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-          } else
-            NewBaseInsertPt = std::next(BaseI->getIterator());
-        } else {
-          // If the current base is an argument or global value, the new base
-          // will be inserted to the entry block.
-          NewBaseInsertBB = &BaseGEP->getFunction()->getEntryBlock();
-          NewBaseInsertPt = NewBaseInsertBB->getFirstInsertionPt();
-        }
-        IRBuilder<> NewBaseBuilder(NewBaseInsertBB, NewBaseInsertPt);
-        // Create a new base.
-        Value *BaseIndex = ConstantInt::get(PtrIdxTy, BaseOffset);
-        NewBaseGEP = OldBase;
-        if (NewBaseGEP->getType() != I8PtrTy)
-          NewBaseGEP = NewBaseBuilder.CreatePointerCast(NewBaseGEP, I8PtrTy);
-        NewBaseGEP =
-            NewBaseBuilder.CreateGEP(I8Ty, NewBaseGEP, BaseIndex, "splitgep");
-        NewGEPBases.insert(NewBaseGEP);
+        createNewBase(BaseOffset, OldBase, GEP);
       }

       IRBuilder<> Builder(GEP);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index b6a16217dfae39..f6e64c49ef05ee 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16070,6 +16070,20 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                   AM.Scale);
 }

+// Check whether the two offsets belong to the same imm24 range and share the
+// same high 12 bits, so that the high part can be materialized by the shifted
+// immediate of a single add.
+int64_t
+AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
+                                                      int64_t MaxOffset) const {
+  int64_t HighPart = MinOffset & ~0xfffULL;
+  if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
+    // Rebase the value to an integer multiple of imm12.
+    return HighPart;
+  }
+
+  return 0;
+}
+
 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
   // Consider splitting large offset of struct or array.
   return true;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 3c8479e1f6e3c3..6ddbcd41dcb769 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -699,6 +699,9 @@ class AArch64TargetLowering : public TargetLowering {
                              unsigned AS,
                              Instruction *I = nullptr) const override;

+  int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
+                                         int64_t MaxOffset) const override;
+
   /// Return true if an FMA operation is faster than a pair of fmul and fadd
   /// instructions. fmuladd intrinsics will be expanded to FMAs when this method
   /// returns true, otherwise fmuladd is expanded to fmul + fadd.
diff --git a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
index 72c979c5a844df..3d4749a7b8e7df 100644
--- a/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-addrmode.ll
@@ -252,9 +252,8 @@ define i64 @LdOffset_i64_multi_offset(ptr %a) {
 ; CHECK-LABEL: LdOffset_i64_multi_offset:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    add x8, x0, #2031, lsl #12 // =8318976
-; CHECK-NEXT:    add x8, x8, #960
-; CHECK-NEXT:    ldr x9, [x8]
-; CHECK-NEXT:    ldr x8, [x8, #2056]
+; CHECK-NEXT:    ldr x9, [x8, #960]
+; CHECK-NEXT:    ldr x8, [x8, #3016]
 ; CHECK-NEXT:    add x0, x8, x9
 ; CHECK-NEXT:    ret
   %arrayidx = getelementptr inbounds i64, ptr %a, i64 1039992
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
index 080b3dd75ee9a9..097575ca86bccb 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/large-offset-gep.ll
@@ -6,18 +6,17 @@ define void @test1(ptr %s, i32 %n) {
 ; CHECK-LABEL: test1:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldr x9, [x0]
-; CHECK-NEXT:    mov w10, #40000 // =0x9c40
-; CHECK-NEXT:    mov w8, wzr
-; CHECK-NEXT:    add x9, x9, x10
-; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    ldr x8, [x0]
+; CHECK-NEXT:    mov w9, wzr
+; CHECK-NEXT:    add x8, x8, #9, lsl #12 // =36864
+; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.ge .LBB0_2
 ; CHECK-NEXT:  .LBB0_1: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str w8, [x9, #4]
-; CHECK-NEXT:    add w8, w8, #1
-; CHECK-NEXT:    str w8, [x9]
-; CHECK-NEXT:    cmp w8, w1
+; CHECK-NEXT:    str w9, [x8, #3140]
+; CHECK-NEXT:    add w9, w9, #1
+; CHECK-NEXT:    str w9, [x8, #3136]
+; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.lt .LBB0_1
 ; CHECK-NEXT:  .LBB0_2: // %while_end
 ; CHECK-NEXT:    ret
@@ -47,16 +46,15 @@ define void @test2(ptr %struct, i32 %n) {
 ; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    cbz x0, .LBB1_3
 ; CHECK-NEXT:  // %bb.1: // %while_cond.preheader
-; CHECK-NEXT:    mov w8, #40000 // =0x9c40
 ; CHECK-NEXT:    mov w9, wzr
-; CHECK-NEXT:    add x8, x0, x8
+; CHECK-NEXT:    add x8, x0, #9, lsl #12 // =36864
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.ge .LBB1_3
 ; CHECK-NEXT:  .LBB1_2: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str w9, [x8, #4]
+; CHECK-NEXT:    str w9, [x8, #3140]
 ; CHECK-NEXT:    add w9, w9, #1
-; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    str w9, [x8, #3136]
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.lt .LBB1_2
 ; CHECK-NEXT:  .LBB1_3: // %while_end
@@ -89,16 +87,15 @@ define void @test3(ptr %s1, ptr %s2, i1 %cond, i32 %n) {
 ; CHECK-NEXT:    csel x8, x1, x0, ne
 ; CHECK-NEXT:    cbz x8, .LBB2_3
 ; CHECK-NEXT:  // %bb.1: // %while_cond.preheader
-; CHECK-NEXT:    mov w10, #40000 // =0x9c40
 ; CHECK-NEXT:    mov w9, wzr
-; CHECK-NEXT:    add x8, x8, x10
+; CHECK-NEXT:    add x8, x8, #9, lsl #12 // =36864
 ; CHECK-NEXT:    cmp w9, w3
 ; CHECK-NEXT:    b.ge .LBB2_3
 ; CHECK-NEXT:  .LBB2_2: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str w9, [x8, #4]
+; CHECK-NEXT:    str w9, [x8, #3140]
 ; CHECK-NEXT:    add w9, w9, #1
-; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    str w9, [x8, #3136]
 ; CHECK-NEXT:    cmp w9, w3
 ; CHECK-NEXT:    b.lt .LBB2_2
 ; CHECK-NEXT:  .LBB2_3: // %while_end
@@ -141,17 +138,15 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler {
 ; CHECK-NEXT:    .cfi_personality 156, DW.ref.__FrameHandler
 ; CHECK-NEXT:    .cfi_lsda 28, .Lexception0
 ; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x30, [sp, #-32]! // 8-byte Folded Spill
 ; CHECK-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    .cfi_offset w19, -8
 ; CHECK-NEXT:    .cfi_offset w20, -16
-; CHECK-NEXT:    .cfi_offset w21, -24
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    .cfi_remember_state
 ; CHECK-NEXT:    mov w19, w0
-; CHECK-NEXT:    mov w21, wzr
-; CHECK-NEXT:    mov w20, #40000 // =0x9c40
+; CHECK-NEXT:    mov w20, wzr
 ; CHECK-NEXT:  .LBB3_1: // %while_cond
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:  .Ltmp0:
@@ -159,23 +154,22 @@ define void @test4(i32 %n) uwtable personality ptr @__FrameHandler {
 ; CHECK-NEXT:  .Ltmp1:
 ; CHECK-NEXT:  // %bb.2: // %while_cond_x.split
 ; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT:    add x8, x0, x20
-; CHECK-NEXT:    cmp w21, w19
-; CHECK-NEXT:    str wzr, [x8]
+; CHECK-NEXT:    add x8, x0, #9, lsl #12 // =36864
+; CHECK-NEXT:    cmp w20, w19
+; CHECK-NEXT:    str wzr, [x8, #3136]
 ; CHECK-NEXT:    b.ge .LBB3_4
 ; CHECK-NEXT:  // %bb.3: // %while_body
 ; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
-; CHECK-NEXT:    str w21, [x8, #4]
-; CHECK-NEXT:    add w21, w21, #1
-; CHECK-NEXT:    str w21, [x8]
+; CHECK-NEXT:    str w20, [x8, #3140]
+; CHECK-NEXT:    add w20, w20, #1
+; CHECK-NEXT:    str w20, [x8, #3136]
 ; CHECK-NEXT:    b .LBB3_1
 ; CHECK-NEXT:  .LBB3_4: // %while_end
 ; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x30, [sp], #32 // 8-byte Folded Reload
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    .cfi_restore w19
 ; CHECK-NEXT:    .cfi_restore w20
-; CHECK-NEXT:    .cfi_restore w21
 ; CHECK-NEXT:    .cfi_restore w30
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB3_5: // %cleanup
@@ -223,14 +217,13 @@ define void @test5(ptr %s, i32 %n) {
 ; CHECK-NEXT:    ldr x8, [x0]
 ; CHECK-NEXT:    mov w9, wzr
 ; CHECK-NEXT:    add x8, x8, #19, lsl #12 // =77824
-; CHECK-NEXT:    add x8, x8, #2176
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.ge .LBB4_2
 ; CHECK-NEXT:  .LBB4_1: // %while_body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    str w9, [x8, #4]
+; CHECK-NEXT:    str w9, [x8, #2180]
 ; CHECK-NEXT:    add w9, w9, #1
-; CHECK-NEXT:    str w9, [x8]
+; CHECK-NEXT:    str w9, [x8, #2176]
 ; CHECK-NEXT:    cmp w9, w1
 ; CHECK-NEXT:    b.lt .LBB4_1
 ; CHECK-NEXT:  .LBB4_2: // %while_end
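
Note (illustrative, not part of the patch): the standalone C++ sketch below mirrors the rebasing
arithmetic that the new getPreferredLargeGEPBaseOffset hook performs, using the byte offsets from
the LdOffset_i64_multi_offset test above. The simplified isLegalAddImmediate stub, the main()
driver, and the variable names are assumptions made only for this sketch; the real check is
AArch64TargetLowering::isLegalAddImmediate.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Assumed stand-in for the AArch64 add/sub immediate rule: a 12-bit value,
// optionally shifted left by 12, is encodable by one instruction.
static bool isLegalAddImmediate(int64_t Imm) {
  return (Imm & ~int64_t(0xfff)) == 0 || (Imm & ~(int64_t(0xfff) << 12)) == 0;
}

// Same arithmetic as the hook added in this patch: if both offsets share
// their high 12 bits (bits 12-23) and that high part is a legal add
// immediate, rebase everything on the high part of the minimum offset.
static int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
                                              int64_t MaxOffset) {
  int64_t HighPart = MinOffset & ~int64_t(0xfff);
  if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart))
    return HighPart;
  return 0;
}

int main() {
  // Byte offsets of the two loads in LdOffset_i64_multi_offset.
  int64_t Min = 1039992 * 8; // 8319936 = (2031 << 12) + 960
  int64_t Max = Min + 2056;  // 8321992, address of the second load
  int64_t Base = getPreferredLargeGEPBaseOffset(Min, Max);
  assert(Base == (int64_t(2031) << 12)); // add x8, x0, #2031, lsl #12
  std::printf("base=%lld, low offsets: %lld and %lld\n", (long long)Base,
              (long long)(Min - Base), (long long)(Max - Base)); // 960, 3016
  return 0;
}

With this common base, both accesses fall inside the unsigned 12-bit (scaled) offset range of ldr,
which is why the second add disappears in the updated CHECK lines.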