diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 597c5009f369a..ff3229921dd86 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2782,7 +2782,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         assert(src->IsIntegralConst(0));
         assert(willUseSimdMov);
 #ifdef TARGET_AMD64
-        assert(size % 16 == 0);
+        assert(size >= XMM_REGSIZE_BYTES);
 #else
         assert(size % 8 == 0);
 #endif
@@ -2797,16 +2797,26 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
     {
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);

+        unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                               ? YMM_REGSIZE_BYTES
+                               : XMM_REGSIZE_BYTES;
+
         if (src->gtSkipReloadOrCopy()->IsIntegralConst(0))
         {
             // If the source is constant 0 then always use xorps, it's faster
             // than copying the constant from a GPR to a XMM register.
-            emit->emitIns_R_R(INS_xorps, EA_16BYTE, srcXmmReg, srcXmmReg);
+            emit->emitIns_R_R(INS_xorps, EA_ATTR(regSize), srcXmmReg, srcXmmReg);
         }
         else
         {
             emit->emitIns_Mov(INS_movd, EA_PTRSIZE, srcXmmReg, srcIntReg, /* canSkip */ false);
             emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
+
+            if (regSize == YMM_REGSIZE_BYTES)
+            {
+                // Extend the bytes in the lower lanes to the upper lanes
+                emit->emitIns_R_R_R_I(INS_vinsertf128, EA_32BYTE, srcXmmReg, srcXmmReg, srcXmmReg, 1);
+            }
 #ifdef TARGET_X86
             // For x86, we need one more to convert it from 8 bytes to 16 bytes.
             emit->emitIns_R_R(INS_punpckldq, EA_16BYTE, srcXmmReg, srcXmmReg);
@@ -2814,7 +2824,6 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         }

         instruction simdMov = simdUnalignedMovIns();
-        unsigned regSize = XMM_REGSIZE_BYTES;
         unsigned bytesWritten = 0;

         while (bytesWritten < size)
@@ -2828,8 +2837,21 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
             if (bytesWritten + regSize > size)
             {
+#ifdef TARGET_AMD64
+                if (size - bytesWritten <= XMM_REGSIZE_BYTES)
+                {
+                    regSize = XMM_REGSIZE_BYTES;
+                }
+
+                // Shift dstOffset back to use full SIMD move
+                unsigned shiftBack = regSize - (size - bytesWritten);
+                assert(shiftBack <= regSize);
+                bytesWritten -= shiftBack;
+                dstOffset -= shiftBack;
+#else
                 assert(srcIntReg != REG_NA);
                 break;
+#endif
             }

             if (dstLclNum != BAD_VAR_NUM)
@@ -2849,14 +2871,51 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         size -= bytesWritten;
     }

-    // Fill the remainder using normal stores.
+// Fill the remainder using normal stores.
+#ifdef TARGET_AMD64
+    unsigned regSize = REGSIZE_BYTES;
+
+    while (regSize > size)
+    {
+        regSize /= 2;
+    }
+
+    for (; size > regSize; size -= regSize, dstOffset += regSize)
+    {
+        if (dstLclNum != BAD_VAR_NUM)
+        {
+            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
+        }
+        else
+        {
+            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
+                                dstAddrIndexScale, dstOffset);
+        }
+    }
+
+    if (size > 0)
+    {
+        unsigned shiftBack = regSize - size;
+        assert(shiftBack <= regSize);
+        dstOffset -= shiftBack;
+
+        if (dstLclNum != BAD_VAR_NUM)
+        {
+            emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
+        }
+        else
+        {
+            emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstAddrBaseReg, dstAddrIndexReg,
+                                dstAddrIndexScale, dstOffset);
+        }
+    }
+#else // TARGET_X86
     for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, dstOffset += regSize)
     {
         while (regSize > size)
         {
             regSize /= 2;
         }
-
         if (dstLclNum != BAD_VAR_NUM)
         {
             emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), srcIntReg, dstLclNum, dstOffset);
@@ -2867,6 +2926,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
                                 dstAddrIndexScale, dstOffset);
         }
     }
+#endif
 }

 #ifdef TARGET_AMD64
@@ -3017,8 +3077,13 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
         regNumber tempReg = node->GetSingleTempReg(RBM_ALLFLOAT);

         instruction simdMov = simdUnalignedMovIns();
-        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize;
-             size -= regSize, srcOffset += regSize, dstOffset += regSize)
+
+        // Get the largest SIMD register available if the size is large enough
+        unsigned regSize = (size >= YMM_REGSIZE_BYTES) && compiler->compOpportunisticallyDependsOn(InstructionSet_AVX)
+                               ? YMM_REGSIZE_BYTES
+                               : XMM_REGSIZE_BYTES;
+
+        for (; size >= regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
         {
             if (srcLclNum != BAD_VAR_NUM)
             {
@@ -3041,15 +3106,109 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
             }
         }

-        // TODO-CQ-XArch: On x86 we could copy 8 byte at once by using MOVQ instead of four 4 byte MOV stores.
-        // On x64 it may also be worth copying a 4/8 byte remainder using MOVD/MOVQ, that avoids the need to
-        // allocate a GPR just for the remainder.
+        if (size > 0)
+        {
+            if (size <= XMM_REGSIZE_BYTES)
+            {
+                regSize = XMM_REGSIZE_BYTES;
+            }
+
+            // Copy the remainder by moving the last regSize bytes of the buffer
+            unsigned shiftBack = regSize - size;
+            assert(shiftBack <= regSize);
+
+            srcOffset -= shiftBack;
+            dstOffset -= shiftBack;
+
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(simdMov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(simdMov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(simdMov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+
+        return;
     }

+    // Fill the remainder with normal loads/stores
     if (size > 0)
     {
         regNumber tempReg = node->GetSingleTempReg(RBM_ALLINT);

+#ifdef TARGET_AMD64
+        unsigned regSize = REGSIZE_BYTES;
+
+        while (regSize > size)
+        {
+            regSize /= 2;
+        }
+
+        for (; size > regSize; size -= regSize, srcOffset += regSize, dstOffset += regSize)
+        {
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+
+        if (size > 0)
+        {
+            unsigned shiftBack = regSize - size;
+            assert(shiftBack <= regSize);
+
+            srcOffset -= shiftBack;
+            dstOffset -= shiftBack;
+
+            if (srcLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_R_S(INS_mov, EA_ATTR(regSize), tempReg, srcLclNum, srcOffset);
+            }
+            else
+            {
+                emit->emitIns_R_ARX(INS_mov, EA_ATTR(regSize), tempReg, srcAddrBaseReg, srcAddrIndexReg,
+                                    srcAddrIndexScale, srcOffset);
+            }
+
+            if (dstLclNum != BAD_VAR_NUM)
+            {
+                emit->emitIns_S_R(INS_mov, EA_ATTR(regSize), tempReg, dstLclNum, dstOffset);
+            }
+            else
+            {
+                emit->emitIns_ARX_R(INS_mov, EA_ATTR(regSize), tempReg, dstAddrBaseReg, dstAddrIndexReg,
+                                    dstAddrIndexScale, dstOffset);
+            }
+        }
+#else // TARGET_X86
         for (unsigned regSize = REGSIZE_BYTES; size > 0; size -= regSize, srcOffset += regSize, dstOffset += regSize)
         {
             while (regSize > size)
             {
@@ -3077,6 +3236,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
                                     dstAddrIndexScale, dstOffset);
             }
         }
+#endif
     }
 }

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index c4d56ed23780b..f76340a136098 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -225,7 +225,7 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
     {
         const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
 #ifdef TARGET_AMD64
-        const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
+        const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
 #else
         const bool willUseOnlySimdMov = (size % 8 == 0);
 #endif
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 356ad74c1702d..6f5892d1712fe 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1364,7 +1364,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         switch (blkNode->gtBlkOpKind)
         {
             case GenTreeBlk::BlkOpKindUnroll:
-                if ((size % XMM_REGSIZE_BYTES) != 0)
+                if (size < XMM_REGSIZE_BYTES)
                 {
                     regMaskTP regMask = allRegs(TYP_INT);
 #ifdef TARGET_X86
@@ -1381,7 +1381,7 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)

                 if (size >= XMM_REGSIZE_BYTES)
                 {
                     buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
-                    SetContainsAVXFlags();
+                    SetContainsAVXFlags(size);
                 }
                 break;
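Note on the approach: both the genCodeForInitBlkUnroll and genCodeForCpBlkUnroll changes handle a tail smaller than the register by shifting the offset back and issuing one final full-width move that overlaps bytes already written, instead of breaking the tail into progressively smaller stores. Below is a minimal standalone sketch of that idea in plain C++; it is not part of the patch. memcpy stands in for the SIMD/GPR moves the JIT emits, and kRegSize is a hypothetical stand-in for XMM_REGSIZE_BYTES.

#include <cstddef>
#include <cstring>

// Fill dst[0..size) with 'value' using only full kRegSize-byte stores,
// assuming size >= kRegSize (the SIMD path in the patch has the same guard).
void FillUnrolled(unsigned char* dst, unsigned char value, size_t size)
{
    constexpr size_t kRegSize = 16; // hypothetical stand-in for XMM_REGSIZE_BYTES
    unsigned char pattern[kRegSize];
    std::memset(pattern, value, kRegSize);

    size_t offset = 0;
    while (offset + kRegSize <= size)
    {
        std::memcpy(dst + offset, pattern, kRegSize); // full-width store
        offset += kRegSize;
    }

    if (offset < size)
    {
        // Tail smaller than a register: shift the last store back so it stays
        // full width; the overlapped bytes are simply rewritten with the same value.
        std::memcpy(dst + (size - kRegSize), pattern, kRegSize);
    }
}

The overlap is harmless because the overlapped bytes are rewritten with identical contents (the same fill value, or the same source bytes in the copy case), which is why the patch only needs a single shiftBack adjustment rather than a chain of 8/4/2/1-byte remainder stores.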