Skip to content

Commit c82c78c

Browse files
committed
[SLP] Fix cost estimation of external uses with wrong VF
It assumed that the VF remains constant throughout the tree. That's not always true. This meant that we could query the extraction cost for a lane that is out of bounds. While experimenting with re-vectorisation for AArch64, and we ran into this issue. We cannot add a proper AArch64 test as more changes would need to be brought in. This commit is only fixing the computation of VF and adding an assert. Some tests were failing after adding the assert: - foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll - test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll - test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
1 parent aee21c3 commit c82c78c

File tree

2 files changed

+47
-7
lines changed

2 files changed

+47
-7
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3898,7 +3898,7 @@ class BoUpSLP {
38983898

38993899
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
39003900
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
3901-
int findLaneForValue(Value *V) const {
3901+
unsigned findLaneForValue(Value *V) const {
39023902
unsigned FoundLane = getVectorFactor();
39033903
for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
39043904
std::advance(It, 1)) {
@@ -4344,7 +4344,7 @@ class BoUpSLP {
43444344

43454345
/// This POD struct describes one external user in the vectorized tree.
43464346
struct ExternalUser {
4347-
ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
4347+
ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
43484348
: Scalar(S), User(U), E(E), Lane(L) {}
43494349

43504350
/// Which scalar in our function.
@@ -4357,7 +4357,7 @@ class BoUpSLP {
43574357
const TreeEntry &E;
43584358

43594359
/// Which lane does the scalar belong to.
4360-
int Lane;
4360+
unsigned Lane;
43614361
};
43624362
using UserList = SmallVector<ExternalUser, 16>;
43634363

@@ -7901,7 +7901,7 @@ void BoUpSLP::buildExternalUses(
79017901
// Check if the scalar is externally used as an extra arg.
79027902
const auto ExtI = ExternallyUsedValues.find(Scalar);
79037903
if (ExtI != ExternallyUsedValues.end()) {
7904-
int FoundLane = Entry->findLaneForValue(Scalar);
7904+
unsigned FoundLane = Entry->findLaneForValue(Scalar);
79057905
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
79067906
<< FoundLane << " from " << *Scalar << ".\n");
79077907
ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
@@ -7949,7 +7949,7 @@ void BoUpSLP::buildExternalUses(
79497949

79507950
if (U && Scalar->hasNUsesOrMore(UsesLimit))
79517951
U = nullptr;
7952-
int FoundLane = Entry->findLaneForValue(Scalar);
7952+
unsigned FoundLane = Entry->findLaneForValue(Scalar);
79537953
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
79547954
<< " from lane " << FoundLane << " from " << *Scalar
79557955
<< ".\n");
@@ -14568,8 +14568,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
1456814568
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
1456914569
<< VectorizableTree.size() << ".\n");
1457014570

14571-
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
14572-
1457314571
SmallPtrSet<Value *, 4> CheckedExtracts;
1457414572
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
1457514573
TreeEntry &TE = *VectorizableTree[I];
@@ -14632,6 +14630,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
1463214630
}
1463314631
SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
1463414632
for (ExternalUser &EU : ExternalUses) {
14633+
LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
14634+
<< EU.E.Idx << " in lane " << EU.Lane << "\n");
14635+
LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n");
14636+
LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
14637+
1463514638
// Uses by ephemeral values are free (because the ephemeral value will be
1463614639
// removed prior to code generation, and so the extraction will be
1463714640
// removed as well).
@@ -14739,6 +14742,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
1473914742
// for the extract and the added cost of the sign extend if needed.
1474014743
InstructionCost ExtraCost = TTI::TCC_Free;
1474114744
auto *ScalarTy = EU.Scalar->getType();
14745+
const unsigned BundleWidth = EU.E.getVectorFactor();
14746+
assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
1474214747
auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
1474314748
const TreeEntry *Entry = &EU.E;
1474414749
auto It = MinBWs.find(Entry);
@@ -14752,10 +14757,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
1475214757
VecTy = getWidenedType(MinTy, BundleWidth);
1475314758
ExtraCost =
1475414759
getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
14760+
LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
14761+
<< ExtraCost << "\n");
1475514762
} else {
1475614763
ExtraCost =
1475714764
getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
1475814765
CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
14766+
LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
14767+
<< *VecTy << ": " << ExtraCost << "\n");
1475914768
}
1476014769
// Leave the scalar instructions as is if they are cheaper than extracts.
1476114770
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||

llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,34 @@ define void @test() {
3131
store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
3232
ret void
3333
}
34+
35+
; Same as above, but %a7 is also used as a scalar and must be extracted from
36+
; the wide load. (Or in this case, kept as a scalar load).
37+
define double @test_with_extract() {
38+
; CHECK-LABEL: @test_with_extract(
39+
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
40+
; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
41+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
42+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
43+
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]]
44+
; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8
45+
; CHECK-NEXT: ret double [[A7]]
46+
;
47+
%a0 = load double, ptr @src, align 8
48+
%a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
49+
%a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
50+
%a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
51+
%a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
52+
%a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
53+
%a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
54+
%a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
55+
%res1 = fsub fast double %a0, %a1
56+
%res2 = fsub fast double %a2, %a3
57+
%res3 = fsub fast double %a4, %a5
58+
%res4 = fsub fast double %a6, %a7
59+
store double %res1, ptr @dst, align 8
60+
store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8
61+
store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8
62+
store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
63+
ret double %a7
64+
}

0 commit comments

Comments
 (0)