
[LV] Ensure getScaledReductions only matches extends inside the loop #148264

Open · wants to merge 1 commit into main

Conversation

david-arm (Contributor)

In getScaledReductions for the case where we try to match a partial reduction of the form:

%phi = phi i32 ...
...
%add = add i32 %phi, %zext

where

%zext = zext i8 %some_val to i32

we should ensure that %zext is actually inside the loop.

Fixes #148260
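
For illustration, here is a minimal IR sketch of the shape the new check rejects (the function name, %some_val and %n are hypothetical, not taken from the patch): the zext is defined outside the loop, so it must not be picked up as the extend of a partial reduction. The added @add_of_zext_outside_loop test in the diff covers the real case.

define i32 @zext_outside_loop(i8 %some_val, i32 %n) {
entry:
  %zext = zext i8 %some_val to i32        ; extend is loop-invariant, defined outside the loop
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  %phi = phi i32 [ 0, %entry ], [ %add, %loop ]
  %add = add i32 %phi, %zext              ; reduction add uses the outside-loop extend
  %iv.next = add i32 %iv, 1
  %done = icmp eq i32 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret i32 %add
}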

llvmbot (Member) commented Jul 11, 2025

@llvm/pr-subscribers-llvm-transforms

Author: David Sherwood (david-arm)

Changes

In getScaledReductions for the case where we try to match a partial reduction of the form:

%phi = phi i32 ...
...
%add = add i32 %phi, %zext

where

%zext = zext i8 %some_val to i32

we should ensure that %zext is actually inside the loop.

Fixes #148260


Full diff: https://github.com/llvm/llvm-project/pull/148264.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+6-1)
  • (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll (+247)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f3de24aa4c3d1..7a00e94efb228 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8109,7 +8109,7 @@ bool VPRecipeBuilder::getScaledReductions(
   std::optional<unsigned> BinOpc;
   Type *ExtOpTypes[2] = {nullptr};
 
-  auto CollectExtInfo = [&Exts,
+  auto CollectExtInfo = [this, &Exts,
                          &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
     unsigned I = 0;
     for (Value *OpI : Ops) {
@@ -8117,6 +8117,11 @@ bool VPRecipeBuilder::getScaledReductions(
       if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
         return false;
       Exts[I] = cast<Instruction>(OpI);
+
+      // Other operand should live inside the loop
+      if (!CM.TheLoop->contains(Exts[I]))
+        return false;
+
       ExtOpTypes[I] = ExtOp->getType();
       I++;
     }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 5d5ee570da0ff..70c8019a2115f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -911,6 +911,253 @@ for.exit:                        ; preds = %for.body
   ret i32 %add
 }
 
+define void @add_of_zext_outside_loop(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       for.body:
+; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT:    store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1:       exit:
+; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT:    ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED:       vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    store i8 0, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT:    store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP20]] = add i32 [[VEC_PHI]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP21]] = add i32 [[VEC_PHI1]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED:       scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT:    [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT:    [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW:       for.body:
+; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT:    store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT:    [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW:       exit:
+; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT:    ret void
+;
+entry:
+  %conv = zext i8 %d to i32
+  %a.promoted = load i32, ptr %a, align 1
+  %conv1 = zext i8 %c to i32
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+  %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+  %idxprom = sext i32 %iv to i64
+  %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+  store i8 0, ptr %arrayidx, align 1
+  %rdx.next = add nsw i32 %rdx, %conv1
+  %iv.next = add i32 %iv, 4
+  %cmp = icmp eq i32 %iv.next, 0
+  br i1 %cmp, label %exit, label %for.body
+
+exit:
+  %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+  store i32 %add.lcssa, ptr %a, align 4
+  ret void
+}
+
+define void @add_of_loop_invariant_zext(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT:  entry:
+; CHECK-INTERLEAVE1-NEXT:    [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1:       for.body:
+; CHECK-INTERLEAVE1-NEXT:    [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT:    store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT:    [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT:    [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT:    [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1:       exit:
+; CHECK-INTERLEAVE1-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT:    store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT:    ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT:  entry:
+; CHECK-INTERLEAVED-NEXT:    [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED:       vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT:    [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED:       vector.ph:
+; CHECK-INTERLEAVED-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP14:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED:       vector.body:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP15:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT:    [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP16:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT:    [[TMP17:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT:    [[TMP18:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-INTERLEAVED-NEXT:    [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT:    store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT:    store i8 0, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT:    [[TMP21]] = add i32 [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP22]] = add i32 [[VEC_PHI1]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED:       middle.block:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = add i32 [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED:       scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  entry:
+; CHECK-MAXBW-NEXT:    [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT:    [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW:       for.body:
+; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT:    store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT:    [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT:    [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT:    [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT:    br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW:       exit:
+; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT:    ret void
+;
+entry:
+  %conv = zext i8 %d to i32
+  %a.promoted = load i32, ptr %a, align 1
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+  %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+  %idxprom = sext i32 %iv to i64
+  %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+  store i8 0, ptr %arrayidx, align 1
+  %conv1 = zext i8 %c to i32
+  %rdx.next = add nsw i32 %rdx, %conv1
+  %iv.next = add i32 %iv, 4
+  %cmp = icmp eq i32 %iv.next, 0
+  br i1 %cmp, label %exit, label %for.body
+
+exit:
+  %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+  store i32 %add.lcssa, ptr %a, align 4
+  ret void
+}
+
+
 !0 = distinct !{!0, !1}
 !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
 attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

llvmbot (Member) commented Jul 11, 2025

@llvm/pr-subscribers-vectorizers

Successfully merging this pull request may close these issues:
[LV] Assertion `Val && "isa<> used on a null pointer"' failed