Skip to content

Commit a4b41eb

Browse files
committed
[SLP] Fix cost estimation of external uses with wrong VF
It assumed that the VF remains constant throughout the tree. That's not always true. This meant that we could query the extraction cost for a lane that is out of bounds. While experimenting with re-vectorisation for AArch64, we ran into this issue. We cannot add a proper AArch64 test as more changes would need to be brought in. This commit is only fixing the computation of VF and adding an assert. Some tests were failing after adding the assert: - foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll - test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll - test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
1 parent d386b3b commit a4b41eb

File tree

2 files changed

+42
-2
lines changed

2 files changed

+42
-2
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14568,8 +14568,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
1456814568
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
1456914569
<< VectorizableTree.size() << ".\n");
1457014570

14571-
unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
14572-
1457314571
SmallPtrSet<Value *, 4> CheckedExtracts;
1457414572
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
1457514573
TreeEntry &TE = *VectorizableTree[I];
@@ -14632,6 +14630,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
1463214630
}
1463314631
SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
1463414632
for (ExternalUser &EU : ExternalUses) {
14633+
LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
14634+
<< EU.E.Idx << " in lane " << EU.Lane << "\n");
14635+
LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n");
14636+
LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
14637+
1463514638
// Uses by ephemeral values are free (because the ephemeral value will be
1463614639
// removed prior to code generation, and so the extraction will be
1463714640
// removed as well).
@@ -14739,6 +14742,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
1473914742
// for the extract and the added cost of the sign extend if needed.
1474014743
InstructionCost ExtraCost = TTI::TCC_Free;
1474114744
auto *ScalarTy = EU.Scalar->getType();
14745+
const unsigned BundleWidth = EU.E.getVectorFactor();
14746+
assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
1474214747
auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
1474314748
const TreeEntry *Entry = &EU.E;
1474414749
auto It = MinBWs.find(Entry);
@@ -14752,10 +14757,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
1475214757
VecTy = getWidenedType(MinTy, BundleWidth);
1475314758
ExtraCost =
1475414759
getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
14760+
LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
14761+
<< ExtraCost << "\n");
1475514762
} else {
1475614763
ExtraCost =
1475714764
getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
1475814765
CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
14766+
LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
14767+
<< *VecTy << ": " << ExtraCost << "\n");
1475914768
}
1476014769
// Leave the scalar instructions as is if they are cheaper than extracts.
1476114770
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||

llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,34 @@ define void @test() {
3131
store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
3232
ret void
3333
}
34+
35+
; Same as above, but %a7 is also used as a scalar and must be extracted from
36+
; the wide load. (Or in this case, kept as a scalar load).
37+
define double @test_with_extract() {
38+
; CHECK-LABEL: @test_with_extract(
39+
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
40+
; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
41+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
42+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
43+
; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]]
44+
; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8
45+
; CHECK-NEXT: ret double [[A7]]
46+
;
47+
%a0 = load double, ptr @src, align 8
48+
%a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
49+
%a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
50+
%a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
51+
%a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
52+
%a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
53+
%a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
54+
%a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
55+
%res1 = fsub fast double %a0, %a1
56+
%res2 = fsub fast double %a2, %a3
57+
%res3 = fsub fast double %a4, %a5
58+
%res4 = fsub fast double %a6, %a7
59+
store double %res1, ptr @dst, align 8
60+
store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8
61+
store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8
62+
store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
63+
ret double %a7
64+
}

0 commit comments

Comments
 (0)