JIT: Prove some cases where strength reducing to GC pointers is ok (#104679)

For loops iterating over arrays, we often have bounds that allow us to
prove that an add recurrence formed by strength reduction will stay within
that array. In these cases we know that forming the byrefs eagerly is ok.

For example, when strength reduction is enabled, this changes the
codegen of

```csharp
private struct S
{
    public int A, B, C;
}

[MethodImpl(MethodImplOptions.NoInlining)]
private static int Sum(S[] ss)
{
    int sum = 0;
    for (int i = 0; i < ss.Length; i++)
    {
        S v = ss[i];
        sum += v.A;
        sum += v.B;
        sum += v.C;
    }

    return sum;
}
```
in the following way:
```diff
 G_M63518_IG03:
-       mov      r11d, 16
+       add      rcx, 16
 						;; size=4 bbWeight=0.25 PerfScore 0.06
 G_M63518_IG04:
-       lea      r8, bword ptr [rcx+r11]
+       mov      r8, rcx
        mov      r10d, dword ptr [r8]
        mov      r9d, dword ptr [r8+0x04]
        mov      r8d, dword ptr [r8+0x08]
        add      eax, r10d
        add      eax, r9d
        add      eax, r8d
-       add      r11, 12
+       add      rcx, 12
        dec      edx
        jne      SHORT G_M63518_IG04
 						;; size=31 bbWeight=4 PerfScore 34.00
```
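
In source-level terms, the strength-reduced loop walks the array with a
managed byref instead of a scaled index. The sketch below (an illustration
only, not code produced by the JIT) uses MemoryMarshal.GetArrayDataReference
and Unsafe.Add to show the shape the add recurrence takes: the byref starts
at the array's data and is bumped by sizeof(S) == 12 each iteration, which is
why it must provably stay inside the array for the whole loop.

```csharp
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

private static int Sum(S[] ss)
{
    int sum = 0;
    // The GC-typed add recurrence: starts at the array data ('add rcx, 16').
    ref S cur = ref MemoryMarshal.GetArrayDataReference(ss);
    for (int n = ss.Length; n != 0; n--)
    {
        sum += cur.A + cur.B + cur.C;
        // Bumped by sizeof(S) == 12 ('add rcx, 12'). Forming this byref
        // eagerly is only legal because it never leaves the array object.
        cur = ref Unsafe.Add(ref cur, 1);
    }
    return sum;
}
```
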
jakobbotsch committed Jul 15, 2024
1 parent b885a58 commit 2199c77
Showing 5 changed files with 257 additions and 29 deletions.
46 changes: 46 additions & 0 deletions src/coreclr/jit/assertionprop.cpp
@@ -4848,6 +4848,52 @@ AssertionIndex Compiler::optAssertionIsNonNullInternal(GenTree*
    return NO_ASSERTION_INDEX;
}

//------------------------------------------------------------------------
// optAssertionVNIsNonNull: See if we can prove that the value of a VN is
// non-null using assertions.
//
// Arguments:
//    vn         - VN to check
//    assertions - set of live assertions
//
// Return Value:
//    True if the VN could be proven non-null.
//
bool Compiler::optAssertionVNIsNonNull(ValueNum vn, ASSERT_VALARG_TP assertions)
{
    if (vnStore->IsKnownNonNull(vn))
    {
        return true;
    }

    // Check each assertion to find if we have a vn != null assertion.
    //
    BitVecOps::Iter iter(apTraits, assertions);
    unsigned        index = 0;
    while (iter.NextElem(&index))
    {
        AssertionIndex assertionIndex = GetAssertionIndex(index);
        if (assertionIndex > optAssertionCount)
        {
            break;
        }
        AssertionDsc* curAssertion = optGetAssertion(assertionIndex);
        if (!curAssertion->CanPropNonNull())
        {
            continue;
        }

        if (curAssertion->op1.vn != vn)
        {
            continue;
        }

        return true;
    }

    return false;
}

/*****************************************************************************
 *
 *  Given a tree consisting of a call and a set of available assertions, we
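
For intuition, the non-null fact this helper queries typically originates
from IR that has already dereferenced the object in question, for example a
read of an array's length before the loop. A hedged C# illustration (the
assertion itself is internal JIT state, not something expressed in C#):

```csharp
private static int SumAll(int[] arr)
{
    int sum = 0;
    // Evaluating arr.Length requires arr to be non-null, so past this point
    // the JIT can track an "arr != null" assertion. That assertion is live
    // in the loop preheader, where optAssertionVNIsNonNull can find it by
    // the value number of 'arr'.
    for (int i = 0; i < arr.Length; i++)
    {
        sum += arr[i];
    }
    return sum;
}
```
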
1 change: 1 addition & 0 deletions src/coreclr/jit/compiler.h
@@ -8030,6 +8030,7 @@ class Compiler
    AssertionIndex optAssertionIsSubrange(GenTree* tree, IntegralRange range, ASSERT_VALARG_TP assertions);
    AssertionIndex optAssertionIsSubtype(GenTree* tree, GenTree* methodTableArg, ASSERT_VALARG_TP assertions);
    AssertionIndex optAssertionIsNonNullInternal(GenTree* op, ASSERT_VALARG_TP assertions DEBUGARG(bool* pVnBased));
    bool           optAssertionVNIsNonNull(ValueNum vn, ASSERT_VALARG_TP assertions);
    bool           optAssertionIsNonNull(GenTree* op,
                                         ASSERT_VALARG_TP assertions DEBUGARG(bool* pVnBased) DEBUGARG(AssertionIndex* pIndex));

195 changes: 166 additions & 29 deletions src/coreclr/jit/inductionvariableopts.cpp
@@ -5,29 +5,46 @@
// scalar evolution analysis (see scev.h and scev.cpp for more information
// about the scalar evolution analysis).
//
-// Currently the only optimization done is widening of primary induction
-// variables from 32 bits into 64 bits. This is generally only profitable on
-// x64 that does not allow zero extension of 32-bit values in addressing modes
-// (in contrast, arm64 does have the capability of including zero extensions in
-// addressing modes). For x64 this saves a zero extension for every array
-// access inside the loop, in exchange for some widening or narrowing stores
-// outside the loop:
-// - To make sure the new widened IV starts at the right value it is
-//   initialized to the value of the narrow IV outside the loop (either in the
-//   preheader or at the def location of the narrow IV). Usually the start
-//   value is a constant, in which case the widened IV is just initialized to
-//   the constant value.
-// - If the narrow IV is used after the loop we need to store it back from
-//   the widened IV in the exits. We depend on liveness sets to figure out
-//   which exits to insert IR into.
-//
-// These steps ensure that the wide IV has the right value to begin with and
-// the old narrow IV still has the right value after the loop. Additionally,
-// we must replace every use of the narrow IV inside the loop with the widened
-// IV. This is done by a traversal of the IR inside the loop. We do not
-// actually widen the uses of the IV; rather, we keep all uses and defs as
-// 32-bit, which the backend is able to handle efficiently on x64. Because of
-// this we do not need to worry about overflow.
+// Currently the following optimizations are done:
+//
+// IV widening:
+//   This widens primary induction variables from 32 bits into 64 bits. It is
+//   generally only profitable on x64, which does not allow zero extension of
+//   32-bit values in addressing modes (in contrast, arm64 does have the
+//   capability of including zero extensions in addressing modes). For x64
+//   this saves a zero extension for every array access inside the loop, in
+//   exchange for some widening or narrowing stores outside the loop:
+//   - To make sure the new widened IV starts at the right value, it is
+//     initialized to the value of the narrow IV outside the loop (either in
+//     the preheader or at the def location of the narrow IV). Usually the
+//     start value is a constant, in which case the widened IV is just
+//     initialized to the constant value.
+//   - If the narrow IV is used after the loop, we need to store it back from
+//     the widened IV in the exits. We depend on liveness sets to figure out
+//     which exits to insert IR into.
+//
+//   These steps ensure that the wide IV has the right value to begin with and
+//   the old narrow IV still has the right value after the loop. Additionally,
+//   we must replace every use of the narrow IV inside the loop with the
+//   widened IV. This is done by a traversal of the IR inside the loop. We do
+//   not actually widen the uses of the IV; rather, we keep all uses and defs
+//   as 32-bit, which the backend is able to handle efficiently on x64.
+//   Because of this we do not need to worry about overflow.
+//
+// Loop reversing:
+//   This converts loops that are up-counted into loops that are down-counted.
+//   Down-counted loops can generally do their IV update and compare in a
+//   single instruction, bypassing the need for a separate comparison with a
+//   bound.
+//
+// Strength reduction (disabled):
+//   This changes the stride of primary IVs in a loop to avoid more expensive
+//   multiplications inside the loop. Commonly the primary IVs are only used
+//   to index memory at some element size, which is what gives rise to these
+//   multiplications.
+//
+//   Strength reduction frequently relies on reversing the loop to remove the
+//   last non-multiplied use of the primary IV.
+//

#include "jitpch.h"
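
As rough source-level intuition for the two enabled optimizations described
above (a sketch only; the JIT performs these transformations on its IR, and
the widened IV is shown as hypothetical pseudocode in the comments):

```csharp
// IV widening (conceptual): a 64-bit shadow of the 32-bit IV removes the
// per-access zero extension on x64:
//   long iWide = i;     // initialized outside the loop
//   ... arr[iWide] ...  // loop addressing uses the wide IV
//   i = (int)iWide;     // stored back in the exits if 'i' is live there

private static void UpCounted(int n)
{
    for (int i = 0; i < n; i++) { } // separate compare against 'n' each time
}

private static void DownCounted(int n)
{
    // The IV update and exit test fold into one flag-setting instruction,
    // as in the 'dec edx; jne' sequence in the diff above.
    for (int i = n; i != 0; i--) { }
}
```
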
@@ -1227,6 +1244,7 @@ class StrengthReductionContext
    bool        InitializeCursors(GenTreeLclVarCommon* primaryIVLcl, ScevAddRec* primaryIV);
    void        AdvanceCursors(ArrayStack<CursorInfo>* cursors, ArrayStack<CursorInfo>* nextCursors);
    bool        CheckAdvancedCursors(ArrayStack<CursorInfo>* cursors, int derivedLevel, ScevAddRec** nextIV);
    bool        StaysWithinManagedObject(ArrayStack<CursorInfo>* cursors, ScevAddRec* addRec);
    bool        TryReplaceUsesWithNewPrimaryIV(ArrayStack<CursorInfo>* cursors, ScevAddRec* iv);
    BasicBlock* FindUpdateInsertionPoint(ArrayStack<CursorInfo>* cursors);

@@ -1344,13 +1362,11 @@ bool StrengthReductionContext::TryStrengthReduce()
        }
        assert(nextIV != nullptr);

-       // We need more sanity checks to allow materializing GC-typed add
-       // recs. Otherwise we may eagerly form a GC pointer that was only
-       // lazily formed under some conditions before, which can be
-       // illegal. For now we just bail.
-       if (varTypeIsGC(nextIV->Type))
+       if (varTypeIsGC(nextIV->Type) && !StaysWithinManagedObject(nextCursors, nextIV))
        {
-           JITDUMP("    Next IV has type %s. Bailing.\n", varTypeName(nextIV->Type));
+           JITDUMP(
+               "    Next IV computes a GC pointer that we cannot prove to be inside a managed object. Bailing.\n");
            break;
        }

@@ -1694,6 +1710,127 @@ bool StrengthReductionContext::CheckAdvancedCursors(ArrayStack<CursorInfo>* curs
    return *nextIV != nullptr;
}

//------------------------------------------------------------------------
// StaysWithinManagedObject: Check whether the specified GC-pointer add-rec can
// be guaranteed to stay within the same managed object for the whole loop.
//
// Parameters:
//   cursors - Cursors pointing to the next uses that correspond to the add-rec
//   addRec  - The add recurrence
//
// Returns:
//   True if we were able to prove so.
//
bool StrengthReductionContext::StaysWithinManagedObject(ArrayStack<CursorInfo>* cursors, ScevAddRec* addRec)
{
    int64_t offset;
    Scev*   baseScev = addRec->Start->PeelAdditions(&offset);
    offset           = static_cast<target_ssize_t>(offset);

    // We only support arrays here. To strength reduce Span<T> accesses we need
    // additional properties on the range designated by a Span<T> that we
    // currently do not specify, or we need to prove that the byref we may form
    // in the IV update would have been formed anyway by the loop.
    if (!baseScev->OperIs(ScevOper::Local) || !baseScev->TypeIs(TYP_REF))
    {
        return false;
    }

    // Now use the fact that we keep ARR_ADDRs in the IR when we have array
    // accesses.
    GenTreeArrAddr* arrAddr = nullptr;
    for (int i = 0; i < cursors->Height(); i++)
    {
        CursorInfo& cursor = cursors->BottomRef(i);
        GenTree*    parent = cursor.Tree->gtGetParent(nullptr);
        if ((parent != nullptr) && parent->OperIs(GT_ARR_ADDR))
        {
            arrAddr = parent->AsArrAddr();
            break;
        }
    }

    if (arrAddr == nullptr)
    {
        return false;
    }

    unsigned arrElemSize = arrAddr->GetElemType() == TYP_STRUCT
                               ? m_comp->typGetObjLayout(arrAddr->GetElemClassHandle())->GetSize()
                               : genTypeSize(arrAddr->GetElemType());

    int64_t stepCns;
    if (!addRec->Step->GetConstantValue(m_comp, &stepCns) || ((unsigned)stepCns > arrElemSize))
    {
        return false;
    }

    ScevLocal* local = (ScevLocal*)baseScev;

    ValueNum vn = m_scevContext.MaterializeVN(baseScev);
    if (vn == ValueNumStore::NoVN)
    {
        return false;
    }

    BasicBlock* preheader = m_loop->EntryEdge(0)->getSourceBlock();
    if (!m_comp->optAssertionVNIsNonNull(vn, preheader->bbAssertionOut))
    {
        return false;
    }

    // We have a non-null array. Check that the 'start' offset looks fine.
    // TODO: We could also use assertions on the length of the array. E.g. if
    // we know the length of the array is > 3, then we can allow the add rec to
    // have a later start. Maybe the range check phase can be used?
    if ((offset < 0) || (offset > (int64_t)OFFSETOF__CORINFO_Array__data))
    {
        return false;
    }

    // Now see if we have a bound that guarantees that we iterate fewer times
    // than the length of the array.
    for (int i = 0; i < m_backEdgeBounds.Height(); i++)
    {
        // TODO: EvaluateRelop ought to be powerful enough to prove something
        // like bound < ARR_LENGTH(vn), but it is not able to prove that
        // currently, even for bound = ARR_LENGTH(vn) - 1 (the common case).
        Scev* bound = m_backEdgeBounds.Bottom(i);

        int64_t boundOffset;
        Scev*   boundBase = bound->PeelAdditions(&boundOffset);

        if (bound->TypeIs(TYP_INT))
        {
            boundOffset = static_cast<int32_t>(boundOffset);
        }

        if (boundOffset >= 0)
        {
            // If we take the backedge at least "array length" times, then we
            // would advance the add rec past the end of the array.
            continue;
        }

        ValueNum boundBaseVN = m_scevContext.MaterializeVN(boundBase);

        VNFuncApp vnf;
        if (!m_comp->vnStore->GetVNFunc(boundBaseVN, &vnf))
        {
            continue;
        }

        if ((vnf.m_func != VNF_ARR_LENGTH) || (vnf.m_args[0] != vn))
        {
            continue;
        }

        return true;
    }

    return false;
}

//------------------------------------------------------------------------
// TryReplaceUsesWithNewPrimaryIV: Perform final sanity checks before
// introducing a new primary IV and replacing the uses represented by the
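
Mapping the Sum example from the commit message through the checks in
StaysWithinManagedObject gives a concrete picture (a worked sketch; the
offsets assume the 64-bit array layout, where OFFSETOF__CORINFO_Array__data
is 16):

```csharp
// Add recurrence for the strength-reduced address: <loop, ss + 16, +12>
//
//   base:         the local 'ss', TYP_REF              -> supported shape
//   start offset: 16 == OFFSETOF__CORINFO_Array__data  -> 0 <= 16 <= 16, ok
//   element size: sizeof(S) == 12 (three int fields)
//   step:         12 <= element size of 12             -> ok
//   non-null:     an 'ss != null' assertion is live in the preheader
//                 (from evaluating ss.Length)          -> ok
//   trip bound:   backedge taken ss.Length - 1 times; the bound's base is
//                 ARR_LENGTH(ss), its peeled offset -1 < 0 -> ok
//
// Hence ss + 16 + 12*i never leaves the array object, and the byref may be
// formed eagerly.
```
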
42 changes: 42 additions & 0 deletions src/coreclr/jit/scev.cpp
@@ -220,6 +220,48 @@ bool Scev::IsInvariant()
    return result != ScevVisit::Abort;
}

//------------------------------------------------------------------------
// Scev::PeelAdditions: Peel the additions from a SCEV and return the base SCEV
// and the sum of the offsets peeled.
//
// Parameters:
//   offset - [out] The sum of offsets peeled
//
// Returns:
//   The base SCEV.
//
// Remarks:
//   If the SCEV is 32 bits wide, the caller is expected to apply the proper
//   truncation (or extension to 64 bits).
//
Scev* Scev::PeelAdditions(int64_t* offset)
{
    *offset = 0;

    Scev* scev = this;
    while (scev->OperIs(ScevOper::Add))
    {
        Scev* op1 = ((ScevBinop*)scev)->Op1;
        Scev* op2 = ((ScevBinop*)scev)->Op2;
        if (op1->OperIs(ScevOper::Constant))
        {
            *offset += ((ScevConstant*)op1)->Value;
            scev = op2;
        }
        else if (op2->OperIs(ScevOper::Constant))
        {
            *offset += ((ScevConstant*)op2)->Value;
            scev = op1;
        }
        else
        {
            break;
        }
    }

    return scev;
}

//------------------------------------------------------------------------
// Scev::Equals: Check if two SCEV trees are equal.
//
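
A small trace of PeelAdditions on a nested sum, following the loop above
(V01 stands for a local as an illustrative name):

```csharp
// Peeling the SCEV ((V01 + 4) + 8):
//   iteration 1: op2 is the constant 8 -> *offset = 8,  scev = (V01 + 4)
//   iteration 2: op2 is the constant 4 -> *offset = 12, scev = V01
// V01 is not an Add, so the loop stops: base is V01, *offset == 12.
```
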
2 changes: 2 additions & 0 deletions src/coreclr/jit/scev.h
@@ -75,6 +75,8 @@ struct Scev

    bool IsInvariant();

    Scev* PeelAdditions(int64_t* offset);

    static bool Equals(Scev* left, Scev* right);
};

