-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[LV] Ensure getScaledReductions only matches extends inside the loop #148264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
In getScaledReductions, for the case where we try to match a partial reduction of the form: %phi = phi i32 ... ... %add = add i32 %phi, %zext where %zext = zext i8 %some_val to i32, we should ensure that %zext is actually inside the loop. Fixes llvm#148260
@llvm/pr-subscribers-llvm-transforms Author: David Sherwood (david-arm) ChangesIn getScaledReductions, for the case where we try to match a partial reduction of the form: %phi = phi i32 ... where %zext = zext i8 %some_val to i32, we should ensure that %zext is actually inside the loop. Fixes #148260 Full diff: https://github.com/llvm/llvm-project/pull/148264.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f3de24aa4c3d1..7a00e94efb228 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8109,7 +8109,7 @@ bool VPRecipeBuilder::getScaledReductions(
std::optional<unsigned> BinOpc;
Type *ExtOpTypes[2] = {nullptr};
- auto CollectExtInfo = [&Exts,
+ auto CollectExtInfo = [this, &Exts,
&ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
unsigned I = 0;
for (Value *OpI : Ops) {
@@ -8117,6 +8117,11 @@ bool VPRecipeBuilder::getScaledReductions(
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
return false;
Exts[I] = cast<Instruction>(OpI);
+
+ // Other operand should live inside the loop
+ if (!CM.TheLoop->contains(Exts[I]))
+ return false;
+
ExtOpTypes[I] = ExtOp->getType();
I++;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 5d5ee570da0ff..70c8019a2115f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -911,6 +911,253 @@ for.exit: ; preds = %for.body
ret i32 %add
}
+define void @add_of_zext_outside_loop(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED: vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20]] = add i32 [[VEC_PHI]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add i32 [[VEC_PHI1]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %d to i32
+ %a.promoted = load i32, ptr %a, align 1
+ %conv1 = zext i8 %c to i32
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+ %idxprom = sext i32 %iv to i64
+ %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+ store i8 0, ptr %arrayidx, align 1
+ %rdx.next = add nsw i32 %rdx, %conv1
+ %iv.next = add i32 %iv, 4
+ %cmp = icmp eq i32 %iv.next, 0
+ br i1 %cmp, label %exit, label %for.body
+
+exit:
+ %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+ store i32 %add.lcssa, ptr %a, align 4
+ ret void
+}
+
+define void @add_of_loop_invariant_zext(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED: vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add i32 [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add i32 [[VEC_PHI1]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %d to i32
+ %a.promoted = load i32, ptr %a, align 1
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+ %idxprom = sext i32 %iv to i64
+ %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+ store i8 0, ptr %arrayidx, align 1
+ %conv1 = zext i8 %c to i32
+ %rdx.next = add nsw i32 %rdx, %conv1
+ %iv.next = add i32 %iv, 4
+ %cmp = icmp eq i32 %iv.next, 0
+ br i1 %cmp, label %exit, label %for.body
+
+exit:
+ %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+ store i32 %add.lcssa, ptr %a, align 4
+ ret void
+}
+
+
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
|
@llvm/pr-subscribers-vectorizers Author: David Sherwood (david-arm) ChangesIn getScaledReductions, for the case where we try to match a partial reduction of the form: %phi = phi i32 ... where %zext = zext i8 %some_val to i32, we should ensure that %zext is actually inside the loop. Fixes #148260 Full diff: https://github.com/llvm/llvm-project/pull/148264.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f3de24aa4c3d1..7a00e94efb228 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8109,7 +8109,7 @@ bool VPRecipeBuilder::getScaledReductions(
std::optional<unsigned> BinOpc;
Type *ExtOpTypes[2] = {nullptr};
- auto CollectExtInfo = [&Exts,
+ auto CollectExtInfo = [this, &Exts,
&ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
unsigned I = 0;
for (Value *OpI : Ops) {
@@ -8117,6 +8117,11 @@ bool VPRecipeBuilder::getScaledReductions(
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
return false;
Exts[I] = cast<Instruction>(OpI);
+
+ // Other operand should live inside the loop
+ if (!CM.TheLoop->contains(Exts[I]))
+ return false;
+
ExtOpTypes[I] = ExtOp->getType();
I++;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 5d5ee570da0ff..70c8019a2115f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -911,6 +911,253 @@ for.exit: ; preds = %for.body
ret i32 %add
}
+define void @add_of_zext_outside_loop(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED: vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20]] = add i32 [[VEC_PHI]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add i32 [[VEC_PHI1]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %d to i32
+ %a.promoted = load i32, ptr %a, align 1
+ %conv1 = zext i8 %c to i32
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+ %idxprom = sext i32 %iv to i64
+ %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+ store i8 0, ptr %arrayidx, align 1
+ %rdx.next = add nsw i32 %rdx, %conv1
+ %iv.next = add i32 %iv, 4
+ %cmp = icmp eq i32 %iv.next, 0
+ br i1 %cmp, label %exit, label %for.body
+
+exit:
+ %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+ store i32 %add.lcssa, ptr %a, align 4
+ ret void
+}
+
+define void @add_of_loop_invariant_zext(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED: vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add i32 [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add i32 [[VEC_PHI1]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %d to i32
+ %a.promoted = load i32, ptr %a, align 1
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+ %idxprom = sext i32 %iv to i64
+ %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+ store i8 0, ptr %arrayidx, align 1
+ %conv1 = zext i8 %c to i32
+ %rdx.next = add nsw i32 %rdx, %conv1
+ %iv.next = add i32 %iv, 4
+ %cmp = icmp eq i32 %iv.next, 0
+ br i1 %cmp, label %exit, label %for.body
+
+exit:
+ %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+ store i32 %add.lcssa, ptr %a, align 4
+ ret void
+}
+
+
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
|
In getScaledReductions for the case where we try to match a partial reduction of the form:
%phi = phi i32 ...
...
%add = add i32 %phi, %zext
where
%zext = zext i8 %some_val to i32
we should ensure that %zext is actually inside the loop.
Fixes #148260