Use AVX2 in codegen for GT_STORE_BLK (#55604)
* Saving current state; it does not pass tests yet

* Getting available vector register size from the compiler instance instead; resolves failing tests.

* Emit vzeroupper if there is potentially an AVX-SSE transition

* Copy remainder using same tempReg allocated during SIMD moves instead of allocating a GPR

* Better guard for maximum SIMD register size

* Fix typo

* Remove commented out lines

* Insert vzeroupper if using AVX2

* Add another vzeroupper

* Remove vzeroupper, inserting the instruction hurts performance

* Adding vzeroupper again for testing

* Remove debug lines

* Add assert before copying remaining bytes, change regSize condition to depend on AVX, remove vzeroupper inserts.

* Fix typo

* Add check to ensure size is not 0, to prevent allocating a register for structs of size 0.

* Using YMM registers during init block if block is large enough

* Add some comments

* Formatting

* Rename remainder variable to shiftBack; it better describes the logic for the block.

* Use shift-back technique for init block as well (see the sketch below).

* Add assert for init block

* Use an xmm register to move the remainder if it fits; shift back the GPR for the remainder instead of stepping down

* Use xmm in init block for the remainder if it is an appropriate size; shift back the GPR in init instead of stepping down the register size

* Avoid allocating GPR during LSRA for BlkOpKindUnroll

* Remove shift-back of GPR for remainder; add back the step-down to resolve test failures on x86

* Shift-back GPR when handling remainder for AMD64 only
alexcovington committed Sep 18, 2021
1 parent c980180 commit 4d72c6b
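The recurring idea in these commits is the shift-back handling of the remainder: rather than stepping down to ever narrower registers for the tail of a block, the offset is moved back so that one final full-width move ends exactly at the end of the block, rewriting a few already-written bytes on the way. The following is a minimal standalone sketch of that idea, assuming the total size is at least one full register width (as the JIT asserts before taking this path); the function name and the use of memcpy in place of the emitted SIMD moves are illustrative, not code from this commit.

// Sketch only -- not part of the diff. memcpy stands in for the SIMD/GPR
// moves the JIT emits; CopyBlockShiftBack is an illustrative name.
#include <cstddef>
#include <cstring>

void CopyBlockShiftBack(unsigned char* dst, const unsigned char* src, size_t size, size_t regSize)
{
    // Assumes size >= regSize, mirroring the asserts in the unroll paths.
    size_t offset = 0;

    // Copy full regSize-wide chunks while they fit.
    for (; size >= regSize; size -= regSize, offset += regSize)
    {
        std::memcpy(dst + offset, src + offset, regSize);
    }

    if (size > 0)
    {
        // Shift the offset back so one last full-width move ends exactly at
        // the end of the block; the overlapping bytes are rewritten with the
        // same data, which is harmless.
        size_t shiftBack = regSize - size;
        offset -= shiftBack;
        std::memcpy(dst + offset, src + offset, regSize);
    }
}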
Showing 3 changed files with 173 additions and 13 deletions.
180 changes: 170 additions & 10 deletions src/coreclr/jit/codegenxarch.cpp
@@ -2782,7 +2782,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
assert(src->IsIntegralConst(0));
assert(willUseSimdMov);
#ifdef TARGET_AMD64
assert(size % 16 == 0);
assert(size >= XMM_REGSIZE_BYTES);
#else
assert(size % 8 == 0);
#endif
@@ -2797,24 +2797,33 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
{
regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);

unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
? YMM_REGSIZE_BYTES
: XMM_REGSIZE_BYTES;

if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
{
// If the source is constant 0 then always use xorps, it's faster
// than copying the constant from a GPR to a XMM register.
emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg);
emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
}
else
{
emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);

if (regSize == YMM_REGSIZE_BYTES)
{
// Extend the bytes in the lower lanes to the upper lanes
emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1);
}
#ifdef TARGET_X86
// For x86, we need one more to convert it from 8 bytes to 16 bytes.
emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
#endif
}

instruction simdMov = simdUnalignedMovIns();
unsigned regSize = XMM_REGSIZE_BYTES;
unsigned bytesWritten = 0;

while (bytesWritten < size)
@@ -2828,8 +2837,21 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
#endif
if (bytesWritten + regSize > size)
{
#ifdef TARGET_AMD64
if (size - bytesWritten <= XMM_REGSIZE_BYTES)
{
regSize = XMM_REGSIZE_BYTES;
}

// Shift dstOffset back to use full SIMD move
unsigned shiftBack = regSize - (size - bytesWritten);
assert(shiftBack <= regSize);
bytesWritten -= shiftBack;
dstOffset -= shiftBack;
#else
assert(srcIntReg != REG_NA);
break;
#endif
}

if (dstLclNum != BAD_VAR_NUM)
@@ -2849,14 +2871,51 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
size -= bytesWritten;
}

// Fill the remainder using normal stores.
#ifdef TARGET_AMD64
unsigned regSize = REGSIZE_BYTES;

while (regSize > size)
{
regSize /= 2;
}

for (; size > regSize; size -= regSize, dstOffset += regSize)
{
if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}

if (size > 0)
{
unsigned shiftBack = regSize - size;
assert(shiftBack <= regSize);
dstOffset -= shiftBack;

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}
#else // TARGET_X86
for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize)
{
while (regSize > size)
{
regSize /= 2;
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
@@ -2867,6 +2926,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
dstAddrIndexScale, dstOffset);
}
}
#endif
}

#ifdef TARGET_AMD64
@@ -3017,8 +3077,13 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT);

instruction simdMov = simdUnalignedMovIns();
for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize;
size -= regSize, srcOffset += regSize, dstOffset += regSize)

// Get the largest SIMD register available if the size is large enough
unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
? YMM_REGSIZE_BYTES
: XMM_REGSIZE_BYTES;

for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
{
if (srcLclNum != BAD_VAR_NUM)
{
@@ -3041,15 +3106,109 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
}
}

// TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores.
// On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to
// allocate a GPR just for the remainder.
if (size > 0)
{
if (size <= XMM_REGSIZE_BYTES)
{
regSize = XMM_REGSIZE_BYTES;
}

// Copy the remainder by moving the last regSize bytes of the buffer
unsigned shiftBack = regSize - size;
assert(shiftBack <= regSize);

srcOffset -= shiftBack;
dstOffset -= shiftBack;

if (srcLclNum != BAD_VAR_NUM)
{
emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
}
else
{
emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
srcAddrIndexScale, srcOffset);
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}

return;
}

// Fill the remainder with normal loads/stores
if (size > 0)
{
regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT);

#ifdef TARGET_AMD64
unsigned regSize = REGSIZE_BYTES;

while (regSize > size)
{
regSize /= 2;
}

for (; size > regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
{
if (srcLclNum != BAD_VAR_NUM)
{
emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
}
else
{
emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
srcAddrIndexScale, srcOffset);
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}

if (size > 0)
{
unsigned shiftBack = regSize - size;
assert(shiftBack <= regSize);

srcOffset -= shiftBack;
dstOffset -= shiftBack;

if (srcLclNum != BAD_VAR_NUM)
{
emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
}
else
{
emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
srcAddrIndexScale, srcOffset);
}

if (dstLclNum != BAD_VAR_NUM)
{
emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
}
else
{
emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
dstAddrIndexScale, dstOffset);
}
}
#else // TARGET_X86
for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize)
{
while (regSize > size)
Expand Down Expand Up @@ -3077,6 +3236,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
dstAddrIndexScale, dstOffset);
}
}
#endif
}
}
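The register-size decisions in the code above boil down to two choices: use 32-byte YMM moves when the block is large enough and AVX is available, and drop back to a single 16-byte XMM move for a tail that fits in one. Below is a small illustrative model, with the 16/32-byte sizes written out as assumed constants rather than taken from the JIT headers.

// Illustrative model only -- not part of the diff.
#include <cstddef>

constexpr size_t kXmmRegSize = 16;
constexpr size_t kYmmRegSize = 32;

// Main loop: prefer YMM when the block is big enough and AVX is available.
size_t ChooseMainRegSize(size_t blockSize, bool avxAvailable)
{
    return (blockSize >= kYmmRegSize && avxAvailable) ? kYmmRegSize : kXmmRegSize;
}

// Tail: a remainder that fits in an XMM register only needs a 16-byte
// shifted-back move, even if the main loop used YMM moves.
size_t ChooseTailRegSize(size_t mainRegSize, size_t tailSize)
{
    return (tailSize <= kXmmRegSize) ? kXmmRegSize : mainRegSize;
}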

2 changes: 1 addition & 1 deletion src/coreclr/jit/lowerxarch.cpp
@@ -225,7 +225,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
{
const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
#ifdef TARGET_AMD64
const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
#else
const bool willUseOnlySimdMov = (size % 8 == 0);
#endif
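On AMD64 this makes any block of at least 16 bytes eligible for the all-SIMD path, not only multiples of 16. For a 40-byte init block without AVX, for example, codegen now emits two full 16-byte SIMD stores and then a third 16-byte store shifted back by 8 bytes to cover bytes 24 through 39; rewriting the overlapping bytes with the same value is harmless.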
4 changes: 2 additions & 2 deletions src/coreclr/jit/lsraxarch.cpp
@@ -1364,7 +1364,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
switch (blkNode->gtBlkOpKind)
{
case GenTreeBlk::BlkOpKindUnroll:
if ((size % XMM_REGSIZE_BYTES) != 0)
if (size < XMM_REGSIZE_BYTES)
{
regMaskTP regMask = allRegs(TYP_INT);
#ifdef TARGET_X86
@@ -1381,7 +1381,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
if (size >= XMM_REGSIZE_BYTES)
{
buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
SetContainsAVXFlags();
SetContainsAVXFlags(size);
}
break;
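With the remainder handled by shifted-back SIMD moves, LSRA now reserves an internal integer register only when the whole block is smaller than an XMM register. A 24-byte unrolled block, for example, previously needed a GPR for its 8-byte tail; that tail is now covered by a 16-byte SIMD move shifted back by 8 bytes, so only the internal float register is required.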
