From 3a7b06453eec84b5fd7c3178339fd230f21b5b35 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 15 May 2024 14:08:37 +0200 Subject: [PATCH 01/49] Add initial code for @llvm.masked.compress intrinsics --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 5 ++ llvm/include/llvm/IR/Intrinsics.td | 5 ++ .../include/llvm/Target/TargetSelectionDAG.td | 6 ++ .../SelectionDAG/LegalizeIntegerTypes.cpp | 20 +++++++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 3 + .../SelectionDAG/LegalizeVectorOps.cpp | 50 ++++++++++++++++ .../SelectionDAG/LegalizeVectorTypes.cpp | 60 +++++++++++++++++++ .../SelectionDAG/SelectionDAGBuilder.cpp | 7 +++ .../SelectionDAG/SelectionDAGDumper.cpp | 1 + llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 + 10 files changed, 160 insertions(+) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index d8af97957e48ec..71dfd8b43b7108 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1294,6 +1294,11 @@ enum NodeType { MLOAD, MSTORE, + // Masked compress - consecutively place vector elements based on mask + // e.g., vec = {A, B, C, D} and mask = 1010 + // --> {A, C, ?, ?} where ? is undefined + MCOMPRESS, + // Masked gather and scatter - load and store operations for a vector of // random addresses with additional mask operand that prevents memory // accesses to the masked-off lanes. diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index f1c7d950f92755..e924d28956b0a5 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2362,6 +2362,11 @@ def int_masked_compressstore: [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, NoCapture>]>; +def int_masked_compress: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [IntrNoMem, IntrWillReturn]>; + // Test whether a pointer is associated with a type metadata identifier. def int_type_test : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], [IntrNoMem, IntrWillReturn, IntrSpeculatable]>; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 1684b424e3b442..061330fb4e08f3 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -266,6 +266,10 @@ def SDTMaskedScatter : SDTypeProfile<0, 4, [ SDTCisSameNumEltsAs<0, 1>, SDTCisSameNumEltsAs<0, 3> ]>; +def SDTMaskedCompress : SDTypeProfile<1, 2, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, +]>; + def SDTVecShuffle : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ]>; @@ -731,6 +735,8 @@ def masked_gather : SDNode<"ISD::MGATHER", SDTMaskedGather, def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def masked_compress : SDNode<"ISD::MCOMPRESS", SDTMaskedCompress>; + // Do not use ld, st directly. Use load, extload, sextload, zextload, store, // and truncst (see below). 
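// For illustration (an example use, not a definition added by this patch): at the IR level the new
// intrinsic is called as, e.g.,
//   %out = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> %vec, <4 x i1> %mask)
// and maps onto the masked_compress / ISD::MCOMPRESS node defined here. Unlike masked_gather and
// masked_scatter, the node carries no chain and no memory operand because MCOMPRESS is a pure
// value operation; SDTMaskedCompress only requires the result and source to be vectors with the
// same number of elements.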
def ld : SDNode<"ISD::LOAD" , SDTLoad, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 0aa36deda79dcc..80f645b433cbe4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -87,6 +87,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { break; case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast(N)); break; + case ISD::MCOMPRESS: Res = PromoteIntRes_MCOMPRESS(N); break; case ISD::SELECT: case ISD::VSELECT: case ISD::VP_SELECT: @@ -948,6 +949,11 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::PromoteIntRes_MCOMPRESS(SDNode *N) { + SDValue Vec = GetPromotedInteger(N->getOperand(0)); + return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), Vec.getValueType(), Vec, N->getOperand(1)); +} + /// Promote the overflow flag of an overflowing arithmetic node. SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) { // Change the return type of the boolean result while obeying @@ -1855,6 +1861,7 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), OpNo); break; + case ISD::MCOMPRESS: Res = PromoteIntOp_MCOMPRESS(N, OpNo); break; case ISD::VP_TRUNCATE: case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::BF16_TO_FP: @@ -2335,6 +2342,19 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, N->getIndexType(), TruncateStore); } +SDValue DAGTypeLegalizer::PromoteIntOp_MCOMPRESS(SDNode *N, unsigned OpNo) { + SDValue Vec = N->getOperand(0); + SDValue Mask = N->getOperand(1); + EVT VT = Vec.getValueType(); + + if (OpNo == 0) + Vec = GetPromotedInteger(Vec); + else + Mask = PromoteTargetBoolean(Mask, VT); + + return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), VT, Vec, Mask); +} + SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); if (N->getOpcode() == ISD::VP_TRUNCATE) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d925089d5689f1..5fb14757f8991f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -321,6 +321,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); + SDValue PromoteIntRes_MCOMPRESS(SDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_FFREXP(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); @@ -390,6 +391,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MCOMPRESS(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); SDValue PromoteIntOp_FIX(SDNode *N); SDValue PromoteIntOp_ExpOp(SDNode *N); @@ -882,6 +884,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi, bool SplitSETCC = false); + void SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi); void 
SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 423df9ae6b2a55..759de775ba011d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -134,6 +134,7 @@ class VectorLegalizer { SDValue ExpandVSELECT(SDNode *Node); SDValue ExpandVP_SELECT(SDNode *Node); SDValue ExpandVP_MERGE(SDNode *Node); + SDValue ExpandMCOMPRESS(SDNode *Node); SDValue ExpandVP_REM(SDNode *Node); SDValue ExpandSELECT(SDNode *Node); std::pair ExpandLoad(SDNode *N); @@ -442,6 +443,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: case ISD::MGATHER: + case ISD::MCOMPRESS: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1101,6 +1103,9 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; break; + case ISD::MCOMPRESS: + Results.push_back(ExpandMCOMPRESS(Node)); + return; } SDValue Unrolled = DAG.UnrollVectorOp(Node); @@ -1505,6 +1510,51 @@ SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) { return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2); } +SDValue VectorLegalizer::ExpandMCOMPRESS(SDNode *Node) { + SDLoc DL(Node); + SDValue Vec = Node->getOperand(0); + SDValue Mask = Node->getOperand(1); + + EVT VecVT = Vec.getValueType(); + EVT ScalarVT = VecVT.getScalarType(); + EVT MaskScalarVT = Mask.getValueType().getScalarType(); + + assert(TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(ScalarVT) && TLI.isTypeLegal(MaskScalarVT) && + "Need legal vector/mask element types to scalarize masked compress."); + + SDValue StackPtr = DAG.CreateStackTemporary(VecVT); + SDValue Chain = DAG.getEntryNode(); + SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); + + unsigned NumElms = VecVT.getVectorNumElements(); + // Skip element zero, as we always copy this to the output vector. + for (unsigned I = 0; I < NumElms; I++) { + SDValue Idx = DAG.getVectorIdxConstant(I, DL); + + SDValue ValI = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); + SDValue OutPtr = + TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); + Chain = DAG.getStore(Chain, DL, ValI, OutPtr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + + // Skip this for last element. + if (I < NumElms - 1) { + // Get the mask value and add it to the current output position. This + // either increments by 1 if MaskI is true or adds 0 otherwise. + SDValue MaskI = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); + MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); + MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI); + OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI); + } + } + + int FI = cast(StackPtr.getNode())->getIndex(); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); +} + SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) { // Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB. 
EVT VT = Node->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index cd858003cf03bc..62e7febed65680 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1058,6 +1058,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_GATHER: SplitVecRes_Gather(cast(N), Lo, Hi, /*SplitSETCC*/ true); break; + case ISD::MCOMPRESS: + SplitVecRes_MCOMPRESS(N, Lo, Hi); + break; case ISD::SETCC: case ISD::VP_SETCC: SplitVecRes_SETCC(N, Lo, Hi); @@ -2304,6 +2307,63 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, ReplaceValueWith(SDValue(N, 1), Ch); } +void DAGTypeLegalizer::SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo, + SDValue &Hi) { + // This is not "trivial", as there is a dependency between the two subvectors. + // Depending on the number of 1s in the mask, the elements from the Hi vector + // need to be moved to the Lo vector. So we just perform this as one "big" + // operation (analogously to the default MCOMPRESS expand implementation), by + // writing to memory and then loading the Lo and Hi vectors from that. This + // gets rid of MCOMPRESS and all other operands can be legalized later. + SDLoc DL(N); + SDValue Vec = N->getOperand(0); + SDValue Mask = N->getOperand(1); + + EVT VecVT = Vec.getValueType(); + EVT SubVecVT = VecVT.getHalfNumVectorElementsVT(*DAG.getContext()); + EVT ScalarVT = VecVT.getScalarType(); + EVT MaskScalarVT = Mask.getValueType().getScalarType(); + + // TODO: This code is duplicated here and in LegalizeVectorOps. + SDValue StackPtr = DAG.CreateStackTemporary(VecVT); + SDValue Chain = DAG.getEntryNode(); + SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); + + unsigned NumElms = VecVT.getVectorNumElements(); + // Skip element zero, as we always copy this to the output vector. + for (unsigned I = 0; I < NumElms; I++) { + SDValue Idx = DAG.getVectorIdxConstant(I, DL); + + SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); + SDValue OutPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); + Chain = DAG.getStore( + Chain, DL, ValI, OutPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + + // Skip this for last element. + if (I < NumElms - 1) { + // Get the mask value and add it to the current output position. This + // either increments by 1 if MaskI is true or adds 0 otherwise. 
+ SDValue MaskI = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); + MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); + MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI); + OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI); + } + } + + int FI = cast(StackPtr.getNode())->getIndex(); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + SDValue HiPtr = TLI.getVectorElementPointer( + DAG, StackPtr, VecVT, DAG.getConstant(NumElms / 2, DL, MVT::i32)); + + Lo = DAG.getLoad(SubVecVT, DL, Chain, StackPtr, PtrInfo); + Hi = DAG.getLoad( + SubVecVT, DL, Chain, HiPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); +} + void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { assert(N->getValueType(0).isVector() && N->getOperand(0).getValueType().isVector() && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ca352da5d36eb4..665bab6121837a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6718,6 +6718,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::masked_compressstore: visitMaskedStore(I, true /* IsCompressing */); return; + case Intrinsic::masked_compress: + setValue(&I, DAG.getNode(ISD::MCOMPRESS, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + Flags)); + return; case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 59742e90c6791c..37288054b0e7b0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -416,6 +416,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::MSTORE: return "masked_store"; case ISD::MGATHER: return "masked_gather"; case ISD::MSCATTER: return "masked_scatter"; + case ISD::MCOMPRESS: return "masked_compress"; case ISD::VAARG: return "vaarg"; case ISD::VACOPY: return "vacopy"; case ISD::VAEND: return "vaend"; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 09b70cfb72278f..5ee12be752b277 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -956,6 +956,9 @@ void TargetLoweringBase::initActions() { // Named vector shuffles default to expand. setOperationAction(ISD::VECTOR_SPLICE, VT, Expand); + // Only some targets support this vector operation. Most need to expand it. + setOperationAction(ISD::MCOMPRESS, VT, Expand); + // VP operations default to expand. #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...)
\ setOperationAction(ISD::SDOPC, VT, Expand); From 75abf0b013f732335ced35002055f0da48f724e0 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 15 May 2024 15:32:47 +0200 Subject: [PATCH 02/49] Remove requirements for legal types --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 759de775ba011d..ca32db26e511cf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1519,9 +1519,6 @@ SDValue VectorLegalizer::ExpandMCOMPRESS(SDNode *Node) { EVT ScalarVT = VecVT.getScalarType(); EVT MaskScalarVT = Mask.getValueType().getScalarType(); - assert(TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(ScalarVT) && TLI.isTypeLegal(MaskScalarVT) && - "Need legal vector/mask element types to scalarize masked compress."); - SDValue StackPtr = DAG.CreateStackTemporary(VecVT); SDValue Chain = DAG.getEntryNode(); SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); From 0329bc9652f9a6c633924d951d6694399d9f6af7 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 15 May 2024 16:33:36 +0200 Subject: [PATCH 03/49] Add tests for AArch64 --- llvm/test/CodeGen/AArch64/masked-compress.ll | 280 +++++++++++++++++++ 1 file changed, 280 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/masked-compress.ll diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll new file mode 100644 index 00000000000000..54c3beab82f768 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -0,0 +1,280 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s + +define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_v4i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: shl.4s v1, v1, #31 +; CHECK-NEXT: cmlt.4s v1, v1, #0 +; CHECK-NEXT: mov.s w9, v1[1] +; CHECK-NEXT: mov.s w10, v1[2] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: bfi x8, x11, #2, #1 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: add w9, w11, w9 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: st1.s { v0 }[1], [x8] +; CHECK-NEXT: add w10, w9, w10 +; CHECK-NEXT: orr x9, x11, x9, lsl #2 +; CHECK-NEXT: bfi x11, x10, #2, #2 +; CHECK-NEXT: st1.s { v0 }[2], [x9] +; CHECK-NEXT: st1.s { v0 }[3], [x11] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> %vec, <4 x i1> %mask) + ret <4 x i32> %out +} + +define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { +; CHECK-LABEL: test_compress_v16i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: shl.16b v1, v1, #7 +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: st1.b { v0 }[0], [x8] +; CHECK-NEXT: mov x13, sp +; CHECK-NEXT: cmlt.16b v1, v1, #0 +; CHECK-NEXT: umov.b w9, v1[0] +; CHECK-NEXT: umov.b w10, v1[1] +; CHECK-NEXT: umov.b w11, v1[2] +; CHECK-NEXT: umov.b w14, v1[3] +; CHECK-NEXT: bfxil x12, x9, #0, #1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: add w9, w9, w10 +; 
CHECK-NEXT: umov.b w10, v1[4] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: st1.b { v0 }[1], [x12] +; CHECK-NEXT: orr x12, x8, x9 +; CHECK-NEXT: add w9, w9, w11 +; CHECK-NEXT: umov.b w11, v1[5] +; CHECK-NEXT: and w14, w14, #0x1 +; CHECK-NEXT: st1.b { v0 }[2], [x12] +; CHECK-NEXT: add w14, w9, w14 +; CHECK-NEXT: umov.b w12, v1[6] +; CHECK-NEXT: orr x9, x8, x9 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: st1.b { v0 }[3], [x9] +; CHECK-NEXT: orr x9, x8, x14 +; CHECK-NEXT: add w10, w14, w10 +; CHECK-NEXT: umov.b w14, v1[7] +; CHECK-NEXT: st1.b { v0 }[4], [x9] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: bfxil x13, x10, #0, #4 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: umov.b w11, v1[8] +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: bfxil x9, x10, #0, #4 +; CHECK-NEXT: st1.b { v0 }[5], [x13] +; CHECK-NEXT: umov.b w13, v1[9] +; CHECK-NEXT: add w10, w10, w12 +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: and w14, w14, #0x1 +; CHECK-NEXT: st1.b { v0 }[6], [x9] +; CHECK-NEXT: umov.b w9, v1[10] +; CHECK-NEXT: bfxil x12, x10, #0, #4 +; CHECK-NEXT: add w10, w10, w14 +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: bfxil x14, x10, #0, #4 +; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: st1.b { v0 }[7], [x12] +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: bfxil x11, x10, #0, #4 +; CHECK-NEXT: add w10, w10, w13 +; CHECK-NEXT: umov.b w13, v1[11] +; CHECK-NEXT: st1.b { v0 }[8], [x14] +; CHECK-NEXT: umov.b w14, v1[12] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfxil x12, x10, #0, #4 +; CHECK-NEXT: add w9, w10, w9 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: st1.b { v0 }[9], [x11] +; CHECK-NEXT: umov.b w11, v1[13] +; CHECK-NEXT: bfxil x10, x9, #0, #4 +; CHECK-NEXT: st1.b { v0 }[10], [x12] +; CHECK-NEXT: umov.b w12, v1[14] +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: and w14, w14, #0x1 +; CHECK-NEXT: add w9, w9, w13 +; CHECK-NEXT: st1.b { v0 }[11], [x10] +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: add w13, w9, w14 +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: bfxil x10, x9, #0, #4 +; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: add w9, w13, w9 +; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: bfxil x14, x13, #0, #4 +; CHECK-NEXT: bfxil x11, x9, #0, #4 +; CHECK-NEXT: add w9, w9, w12 +; CHECK-NEXT: st1.b { v0 }[12], [x10] +; CHECK-NEXT: bfxil x8, x9, #0, #4 +; CHECK-NEXT: st1.b { v0 }[13], [x14] +; CHECK-NEXT: st1.b { v0 }[14], [x11] +; CHECK-NEXT: st1.b { v0 }[15], [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <16 x i8> @llvm.masked.compress.v16i8(<16 x i8> %vec, <16 x i1> %mask) + ret <16 x i8> %out +} + +define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { +; CHECK-LABEL: test_compress_large: +; CHECK: ; %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #48 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: ; kill: def $d2 killed $d2 def $q2 +; CHECK-NEXT: umov.b w9, v2[0] +; CHECK-NEXT: umov.b w10, v2[1] +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: umov.b w11, v2[2] +; CHECK-NEXT: umov.b w13, v2[3] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: umov.b w14, v2[4] +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: bfi x12, x9, #2, #1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: and w10, w11, #0x1 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: orr x11, x8, x9, lsl #2 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: umov.b w10, v2[5] +; CHECK-NEXT: st1.s { v0 }[1], [x12] +; CHECK-NEXT: add w13, w9, w13 +; CHECK-NEXT: orr x9, x8, x9, lsl #2 +; CHECK-NEXT: st1.s { v0 }[2], [x11] +; CHECK-NEXT: umov.b w11, v2[6] +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: and w14, w14, #0x1 +; CHECK-NEXT: bfi x12, x13, #2, #3 +; CHECK-NEXT: st1.s { v0 }[3], [x9] +; CHECK-NEXT: add w13, w13, w14 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: add w9, w13, w9 +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: str s1, [x12] +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: bfi x10, x9, #2, #3 +; CHECK-NEXT: bfi x14, x13, #2, #3 +; CHECK-NEXT: add w9, w9, w11 +; CHECK-NEXT: bfi x8, x9, #2, #3 +; CHECK-NEXT: st1.s { v1 }[1], [x14] +; CHECK-NEXT: st1.s { v1 }[2], [x10] +; CHECK-NEXT: st1.s { v1 }[3], [x8] +; CHECK-NEXT: ldp q0, q1, [sp] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: ret + %out = call <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> %vec, <8 x i1> %mask) + ret <8 x i32> %out +} + +define <4 x i32> @test_compress_const(<4 x i32> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_const: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov x8, #3 ; =0x3 +; CHECK-NEXT: mov w9, #9 ; =0x9 +; CHECK-NEXT: movk x8, #7, lsl #32 +; CHECK-NEXT: str x8, [sp, #-16]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov w8, #5 ; =0x5 +; CHECK-NEXT: str w9, [sp, #8] +; CHECK-NEXT: str w8, [sp] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> , + <4 x i1> ) + ret <4 x i32> %out +} + +define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_small: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: shl.4h v1, v1, #15 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str h0, [sp, #8] +; CHECK-NEXT: cmlt.4h v1, v1, #0 +; CHECK-NEXT: umov.h w9, v1[0] +; CHECK-NEXT: umov.h w10, v1[1] +; CHECK-NEXT: umov.h w11, v1[2] +; CHECK-NEXT: bfi x8, x9, #1, #1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: add x10, sp, #8 +; CHECK-NEXT: add w11, w9, w11 +; CHECK-NEXT: orr x9, x10, x9, lsl #1 +; CHECK-NEXT: st1.h { v0 }[1], [x8] +; CHECK-NEXT: bfi x10, x11, #1, #2 +; CHECK-NEXT: st1.h { v0 }[2], [x9] +; CHECK-NEXT: st1.h { v0 }[3], [x10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %out = call <4 x i8> @llvm.masked.compress.v4i8(<4 x i8> %vec, <4 x i1> %mask) + ret <4 x i8> %out +} + +define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mask) { +; CHECK-LABEL: test_compress_illegal_element_type: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: shl.4h v1, v1, #15 +; CHECK-NEXT: add x8, sp, #8 +; CHECK-NEXT: ; kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: str h0, [sp, #8] +; CHECK-NEXT: cmlt.4h v1, v1, #0 +; CHECK-NEXT: umov.h w9, v1[0] +; CHECK-NEXT: umov.h w10, v1[1] +; CHECK-NEXT: umov.h w11, v1[2] +; CHECK-NEXT: bfi x8, x9, #1, #1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: add x10, sp, #8 +; CHECK-NEXT: add w11, w9, w11 +; CHECK-NEXT: orr x9, x10, x9, lsl #1 +; CHECK-NEXT: st1.h { v0 }[1], [x8] +; CHECK-NEXT: bfi x10, x11, #1, #2 +; CHECK-NEXT: st1.h { v0 }[2], [x9] +; CHECK-NEXT: st1.h { v0 }[3], [x10] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret + %out = call <4 x i4> @llvm.masked.compress.v4i4(<4 x i4> %vec, <4 x i1> %mask) + ret <4 x i4> %out +} + +declare <4 x i32> @llvm.masked.compress.v4i32(<4 x i32>, <4 x i1>) +declare <16 x i8> @llvm.masked.compress.v16i8(<16 x i8>, <16 x i1>) +declare <4 x i4> @llvm.masked.compress.v4i4(<4 x i4>, <4 x i1>) +declare <4 x i8> @llvm.masked.compress.v4i8(<4 x i8>, <4 x i1>) +declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32>, <8 x i1>) From 73bfebbdb3d7a1c96c8521e5789ad7e59f665da7 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 15 May 2024 16:41:13 +0200 Subject: [PATCH 04/49] Add floating point test --- llvm/test/CodeGen/AArch64/masked-compress.ll | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index 54c3beab82f768..a2f39b9620c957 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -32,6 +32,25 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) { ret <4 x i32> %out } +define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask) { +; CHECK-LABEL: 
test_compress_v2f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ushll.2d v1, v1, #0 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: str d0, [sp] +; CHECK-NEXT: shl.2d v1, v1, #63 +; CHECK-NEXT: cmlt.2d v1, v1, #0 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: bfi x8, x9, #3, #1 +; CHECK-NEXT: st1.d { v0 }[1], [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <2 x double> @llvm.masked.compress.v2f64(<2 x double> %vec, <2 x i1> %mask) + ret <2 x double> %out +} + define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { ; CHECK-LABEL: test_compress_v16i8: ; CHECK: ; %bb.0: From e4423a1b434c086a7787594efbba96aa29e392c4 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 15 May 2024 17:47:42 +0200 Subject: [PATCH 05/49] Add documentation --- llvm/docs/LangRef.rst | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 06809f8bf445d8..773893b83a5d7e 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -24975,6 +24975,85 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i Other targets may support this intrinsic differently, for example, by lowering it into a sequence of branches that guard scalar store operations. +Masked Vector Compress Intrinsic +-------------------------------- + +LLVM provides an intrinsic for compressing data within a vector based on a selection mask. +Semantically, this is similar to :ref:``@llvm.masked.compressstore <_int_compressstore>`` but with weaker assumptions +and without storing the results to memory, i.e., the data remains in the vector. + +.. _int_masked_compress: + +'``llvm.masked.compress.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. A number of scalar values of integer, floating point or pointer data type are collected +from an input vector and placed adjacently within the result vector. A mask defines which elements to collect from the vector. + +:: code-block:: llvm + + declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> , <8 x i1> ) + declare <16 x float> @llvm.masked.compress.v16f32(<16 x float> , <16 x i1> ) + +Overview: +""""""""" + +Selects elements from input vector '``value``' according to the '``mask``'. +All selected elements are written into adjacent lanes in the result vector, from lower to higher. +The mask holds a bit for each vector lane, and is used to select elements to be kept. +The number of valid lanes is equal to the number of active bits in the mask. +The main difference to :ref:`llvm.masked.compressstore <_int_compressstore>` is that the remainder of the vector may +contain undefined values. +This allows for branchless code and better optimization for all targets that do not support the explicit semantics of +:ref:`llvm.masked.compressstore <_int_compressstore>`. +The result vector can be written with a similar effect, as all the selected values are at the lower positions of the +vector, but without requiring branches to avoid writes where the mask is 0. + + +Arguments: +"""""""""" + +The first operand is the input vector, from which elements are selected. +The second operand is the mask, a vector of boolean values. +The mask and the input vector must have the same number of vector elements. + +Semantics: +"""""""""" + +The '``llvm.masked.compress``' intrinsic is designed for compressing data within a vector, i.e., ideally within a register. 
+It allows collecting elements from possibly non-adjacent lanes of a vector and placing them contiguously in the result vector in one IR operation. +It is useful for all targets that support compress operations (e.g., AVX-512, ARM SVE, RISC-V V), and more instruction sets provide a compress operation than an explicit compressstore, i.e., ``llvm.masked.compress`` may yield better performance on more targets than +``llvm.masked.compressstore`` due to its weaker constraints. +This intrinsic allows vectorizing loops with cross-iteration dependencies such as in the following example: + +.. code-block:: c + + // Consecutively store selected values with branchless code. + int *in, *out; bool *mask; int pos = 0; + for (int i = 0; i < size; ++i) { + out[pos] = in[i]; + // if mask[i] == 0, the current value is overwritten in the next iteration. + pos += mask[i]; + } + + +.. code-block:: llvm + + ; Load elements from `in`. + %vec = load <4 x i32>, ptr %inPtr + %mask = load <4 x i1>, ptr %maskPtr + %compressed = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> %vec, <4 x i1> %mask) + store <4 x i32> %compressed, ptr %outPtr + + ; %outPtr should be increased in each iteration by the number of '1's in the mask. + %iMask = bitcast <4 x i1> %mask to i4 + %popcnt = call i4 @llvm.ctpop.i4(i4 %iMask) + %zextPopcnt = zext i4 %popcnt to i64 + %nextOut = add i64 %outPos, %zextPopcnt + Memory Use Markers ------------------ From 3e9967803d110116fc90823ae39a91cfe9d03d2c Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 15 May 2024 18:06:44 +0200 Subject: [PATCH 06/49] Fix formatting --- .../lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 11 ++++++++--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 10 +++++----- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 3 +-- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 80f645b433cbe4..4063144f473937 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -87,7 +87,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { break; case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast(N)); break; - case ISD::MCOMPRESS: Res = PromoteIntRes_MCOMPRESS(N); break; + case ISD::MCOMPRESS: + Res = PromoteIntRes_MCOMPRESS(N); + break; case ISD::SELECT: case ISD::VSELECT: case ISD::VP_SELECT: @@ -951,7 +953,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_MCOMPRESS(SDNode *N) { SDValue Vec = GetPromotedInteger(N->getOperand(0)); - return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), Vec.getValueType(), Vec, N->getOperand(1)); + return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), Vec.getValueType(), Vec, + N->getOperand(1)); } /// Promote the overflow flag of an overflowing arithmetic node.
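// For illustration: PromoteIntRes_MCOMPRESS above simply re-emits MCOMPRESS on the promoted
// vector type and keeps the original mask operand. For example, an MCOMPRESS producing <4 x i8>
// on a target without legal i8 vector elements is rebuilt on the promoted value (e.g. <4 x i16>),
// and the usual promotion bookkeeping then hands users the promoted result.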
@@ -1861,7 +1864,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), OpNo); break; - case ISD::MCOMPRESS: Res = PromoteIntOp_MCOMPRESS(N, OpNo); break; + case ISD::MCOMPRESS: + Res = PromoteIntOp_MCOMPRESS(N, OpNo); + break; case ISD::VP_TRUNCATE: case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; case ISD::BF16_TO_FP: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index ca32db26e511cf..ebf0f63775d44b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1528,11 +1528,11 @@ SDValue VectorLegalizer::ExpandMCOMPRESS(SDNode *Node) { for (unsigned I = 0; I < NumElms; I++) { SDValue Idx = DAG.getVectorIdxConstant(I, DL); - SDValue ValI = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); - SDValue OutPtr = - TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); - Chain = DAG.getStore(Chain, DL, ValI, OutPtr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); + SDValue OutPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); + Chain = DAG.getStore( + Chain, DL, ValI, OutPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); // Skip this for last element. if (I < NumElms - 1) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 665bab6121837a..20461511ac92ff 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6722,8 +6722,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, DAG.getNode(ISD::MCOMPRESS, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)), - Flags)); + getValue(I.getArgOperand(1)), Flags)); return; case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), From b686f83ef295b505e8f3316efc66ac8bea163ebc Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Thu, 16 May 2024 11:15:41 +0200 Subject: [PATCH 07/49] Fix references in docs --- llvm/docs/LangRef.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 773893b83a5d7e..e2d3a986ddedf3 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -24979,7 +24979,7 @@ Masked Vector Compress Intrinsic -------------------------------- LLVM provides an intrinsic for compressing data within a vector based on a selection mask. -Semantically, this is similar to :ref:``@llvm.masked.compressstore <_int_compressstore>`` but with weaker assumptions +Semantically, this is similar to :ref:`llvm.masked.compressstore ` but with weaker assumptions and without storing the results to memory, i.e., the data remains in the vector. .. _int_masked_compress: @@ -25004,10 +25004,10 @@ Selects elements from input vector '``value``' according to the '``mask``'. All selected elements are written into adjacent lanes in the result vector, from lower to higher. The mask holds a bit for each vector lane, and is used to select elements to be kept. The number of valid lanes is equal to the number of active bits in the mask. 
-The main difference to :ref:`llvm.masked.compressstore <_int_compressstore>` is that the remainder of the vector may +The main difference to :ref:`llvm.masked.compressstore ` is that the remainder of the vector may contain undefined values. This allows for branchless code and better optimization for all targets that do not support the explicit semantics of -:ref:`llvm.masked.compressstore <_int_compressstore>`. +:ref:`llvm.masked.compressstore `. The result vector can be written with a similar effect, as all the selected values are at the lower positions of the vector, but without requiring branches to avoid writes where the mask is 0. From 73cc28f42e5e18daf3b57997b4cc9084a78e6327 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Thu, 16 May 2024 12:44:59 +0200 Subject: [PATCH 08/49] Add widen for vector type legalization --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 16 ++++ llvm/test/CodeGen/AArch64/masked-compress.ll | 82 +++++++++++++++++-- 3 files changed, 94 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 5fb14757f8991f..83343ef5c173fc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -975,6 +975,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); + SDValue WidenVecRes_MCOMPRESS(SDNode* N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 62e7febed65680..2a9c8adfc3fc60 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -4273,6 +4273,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: Res = WidenVecRes_VP_STRIDED_LOAD(cast(N)); break; + case ISD::MCOMPRESS: + Res = WidenVecRes_MCOMPRESS(N); + break; case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast(N)); break; @@ -5655,6 +5658,19 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) { return Res; } +SDValue DAGTypeLegalizer::WidenVecRes_MCOMPRESS(SDNode *N) { + SDValue Vec = N->getOperand(0); + SDValue Mask = N->getOperand(1); + EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType()); + EVT WideMaskVT = TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType()); + + // In the default expanded case, adding UNDEF values for the new widened lanes + // allows us to remove their access later, which reduces the number os stores. 
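// For illustration, widening a <3 x i32> MCOMPRESS with a <3 x i1> mask gives a <4 x i32> /
// <4 x i1> operation. Because the added mask lane is filled with zero below, the padding element
// can never be selected, e.g. vec = <a, b, c, pad>, mask = <1, 0, 1, 0> --> <a, c, ?, ?>, and the
// first three lanes of the widened result form the original narrow result.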
+ SDValue WideVec = ModifyToType(Vec, WideVecVT, /*FillWithZeroes=*/true); + SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true); + return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), WideVecVT, WideVec, WideMask); +} + SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0)); diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index a2f39b9620c957..92aea4108a7b7f 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -292,8 +292,80 @@ define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mas ret <4 x i4> %out } -declare <4 x i32> @llvm.masked.compress.v4i32(<4 x i32>, <4 x i1>) -declare <16 x i8> @llvm.masked.compress.v16i8(<16 x i8>, <16 x i1>) -declare <4 x i4> @llvm.masked.compress.v4i4(<4 x i4>, <4 x i1>) -declare <4 x i8> @llvm.masked.compress.v4i8(<4 x i8>, <4 x i1>) -declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32>, <8 x i1>) +define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { +; CHECK-LABEL: test_compress_narrow: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: mov.h v1[0], w0 +; CHECK-NEXT: mov.h v1[1], w1 +; CHECK-NEXT: mov.h v1[2], w2 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: shl.4s v1, v1, #31 +; CHECK-NEXT: cmlt.4s v1, v1, #0 +; CHECK-NEXT: mov.s w8, v1[1] +; CHECK-NEXT: mov.s w9, v1[2] +; CHECK-NEXT: fmov w10, s1 +; CHECK-NEXT: bfi x11, x10, #2, #1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: st1.s { v0 }[1], [x11] +; CHECK-NEXT: add w9, w8, w9 +; CHECK-NEXT: orr x8, x10, x8, lsl #2 +; CHECK-NEXT: bfi x10, x9, #2, #2 +; CHECK-NEXT: st1.s { v0 }[2], [x8] +; CHECK-NEXT: str wzr, [x10] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <3 x i32> @llvm.masked.compress.v3i32(<3 x i32> %vec, <3 x i1> %mask) + ret <3 x i32> %out +} + +define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) { +; CHECK-LABEL: test_compress_narrow_illegal_element_type: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movi.2d v0, #0000000000000000 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: add x12, sp, #8 +; CHECK-NEXT: movi.4h v2, #7 +; CHECK-NEXT: add x11, sp, #8 +; CHECK-NEXT: mov.h v1[1], w1 +; CHECK-NEXT: mov.h v0[0], w3 +; CHECK-NEXT: mov.h v1[2], w2 +; CHECK-NEXT: mov.h v0[1], w4 +; CHECK-NEXT: mov.h v0[2], w5 +; CHECK-NEXT: shl.4h v0, v0, #15 +; CHECK-NEXT: cmlt.4h v0, v0, #0 +; CHECK-NEXT: umov.h w8, v0[0] +; CHECK-NEXT: umov.h w9, v0[1] +; CHECK-NEXT: umov.h w10, v0[2] +; CHECK-NEXT: and.8b v0, v1, v2 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and w13, w8, #0x1 +; CHECK-NEXT: bfi x12, x8, #1, #1 +; CHECK-NEXT: add w8, w13, w9 +; CHECK-NEXT: and w9, w10, #0x1 +; CHECK-NEXT: str h0, [sp, #8] +; CHECK-NEXT: orr x10, x11, x8, lsl #1 +; CHECK-NEXT: add w8, w8, w9 +; CHECK-NEXT: st1.h { v0 }[1], [x12] +; CHECK-NEXT: bfi x11, x8, #1, #2 +; CHECK-NEXT: st1.h { v0 }[2], [x10] +; CHECK-NEXT: strh wzr, [x11] +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: umov.h w0, v0[0] +; CHECK-NEXT: umov.h w1, v0[1] +; CHECK-NEXT: umov.h w2, v0[2] +; CHECK-NEXT: add sp, 
sp, #16 +; CHECK-NEXT: ret + %out = call <3 x i3> @llvm.masked.compress.v3i3(<3 x i3> %vec, <3 x i1> %mask) + ret <3 x i3> %out +} From 8a613f37780c83e9874c0f5454ffec27eb9060f2 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Thu, 16 May 2024 14:48:43 +0200 Subject: [PATCH 09/49] Put expand logic in TargerLowering to avoid code duplication. --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 + .../SelectionDAG/LegalizeVectorOps.cpp | 45 +---------- .../SelectionDAG/LegalizeVectorTypes.cpp | 56 ++----------- .../CodeGen/SelectionDAG/TargetLowering.cpp | 47 +++++++++++ llvm/test/CodeGen/AArch64/masked-compress.ll | 79 +++++++++---------- 5 files changed, 95 insertions(+), 136 deletions(-) diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 50a8c7eb75af5b..ad79c22aa3e1b6 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5472,6 +5472,10 @@ class TargetLowering : public TargetLoweringBase { /// method accepts vectors as its arguments. SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const; + /// Expand a vector MCOMPRESS into a sequence of extract element, store + /// temporarily, advance store position, before re-loading the final vector. + SDValue expandMCOMPRESS(SDNode *Node, SelectionDAG &DAG) const; + /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC /// on the current target. A VP_SETCC will additionally be given a Mask /// and/or EVL not equal to SDValue(). diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index ebf0f63775d44b..bb2d3a8a3a0d16 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -134,7 +134,6 @@ class VectorLegalizer { SDValue ExpandVSELECT(SDNode *Node); SDValue ExpandVP_SELECT(SDNode *Node); SDValue ExpandVP_MERGE(SDNode *Node); - SDValue ExpandMCOMPRESS(SDNode *Node); SDValue ExpandVP_REM(SDNode *Node); SDValue ExpandSELECT(SDNode *Node); std::pair ExpandLoad(SDNode *N); @@ -1104,7 +1103,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { break; case ISD::MCOMPRESS: - Results.push_back(ExpandMCOMPRESS(Node)); + Results.push_back(TLI.expandMCOMPRESS(Node, DAG)); return; } @@ -1510,48 +1509,6 @@ SDValue VectorLegalizer::ExpandVP_MERGE(SDNode *Node) { return DAG.getSelect(DL, Node->getValueType(0), FullMask, Op1, Op2); } -SDValue VectorLegalizer::ExpandMCOMPRESS(SDNode *Node) { - SDLoc DL(Node); - SDValue Vec = Node->getOperand(0); - SDValue Mask = Node->getOperand(1); - - EVT VecVT = Vec.getValueType(); - EVT ScalarVT = VecVT.getScalarType(); - EVT MaskScalarVT = Mask.getValueType().getScalarType(); - - SDValue StackPtr = DAG.CreateStackTemporary(VecVT); - SDValue Chain = DAG.getEntryNode(); - SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); - - unsigned NumElms = VecVT.getVectorNumElements(); - // Skip element zero, as we always copy this to the output vector. - for (unsigned I = 0; I < NumElms; I++) { - SDValue Idx = DAG.getVectorIdxConstant(I, DL); - - SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); - SDValue OutPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); - Chain = DAG.getStore( - Chain, DL, ValI, OutPtr, - MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); - - // Skip this for last element. - if (I < NumElms - 1) { - // Get the mask value and add it to the current output position. 
This - // either increments by 1 if MaskI is true or adds 0 otherwise. - SDValue MaskI = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); - MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); - MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI); - OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI); - } - } - - int FI = cast(StackPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); - return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); -} - SDValue VectorLegalizer::ExpandVP_REM(SDNode *Node) { // Implement VP_SREM/UREM in terms of VP_SDIV/VP_UDIV, VP_MUL, VP_SUB. EVT VT = Node->getValueType(0); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 2a9c8adfc3fc60..34146e9a123084 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2312,56 +2312,14 @@ void DAGTypeLegalizer::SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo, // This is not "trivial", as there is a dependency between the two subvectors. // Depending on the number of 1s in the mask, the elements from the Hi vector // need to be moved to the Lo vector. So we just perform this as one "big" - // operation (analogously to the default MCOMPRESS expand implementation), by - // writing to memory and then loading the Lo and Hi vectors from that. This - // gets rid of MCOMPRESS and all other operands can be legalized later. - SDLoc DL(N); - SDValue Vec = N->getOperand(0); - SDValue Mask = N->getOperand(1); - - EVT VecVT = Vec.getValueType(); - EVT SubVecVT = VecVT.getHalfNumVectorElementsVT(*DAG.getContext()); - EVT ScalarVT = VecVT.getScalarType(); - EVT MaskScalarVT = Mask.getValueType().getScalarType(); - - // TODO: This code is duplicated here and in LegalizeVectorOps. - SDValue StackPtr = DAG.CreateStackTemporary(VecVT); - SDValue Chain = DAG.getEntryNode(); - SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); - - unsigned NumElms = VecVT.getVectorNumElements(); - // Skip element zero, as we always copy this to the output vector. - for (unsigned I = 0; I < NumElms; I++) { - SDValue Idx = DAG.getVectorIdxConstant(I, DL); - - SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); - SDValue OutPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); - Chain = DAG.getStore( - Chain, DL, ValI, OutPtr, - MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); - - // Skip this for last element. - if (I < NumElms - 1) { - // Get the mask value and add it to the current output position. This - // either increments by 1 if MaskI is true or adds 0 otherwise. - SDValue MaskI = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); - MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); - MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI); - OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI); - } - } - - int FI = cast(StackPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); - SDValue HiPtr = TLI.getVectorElementPointer( - DAG, StackPtr, VecVT, DAG.getConstant(NumElms / 2, DL, MVT::i32)); + // operation and then extract the Lo and Hi vectors from that. This gets rid + // of MCOMPRESS and all other operands can be legalized later. 
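// For illustration, with a v8i32 MCOMPRESS this builds the fully compressed v8i32 through the
// generic stack-temporary expansion and then takes Lo and Hi as EXTRACT_SUBVECTOR at element
// offsets 0 and 4, so any movement of elements across the subvector boundary is handled by that
// single expanded operation.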
+ SDValue Compressed = TLI.expandMCOMPRESS(N, DAG); - Lo = DAG.getLoad(SubVecVT, DL, Chain, StackPtr, PtrInfo); - Hi = DAG.getLoad( - SubVecVT, DL, Chain, HiPtr, - MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + SDLoc DL(N); + EVT SubVecVT = Compressed.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Compressed, DAG.getVectorIdxConstant(0, DL)); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Compressed, DAG.getVectorIdxConstant(SubVecVT.getVectorNumElements(), DL)); } void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 7beaeb9b7a1718..f3d48e60c4b585 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11226,6 +11226,53 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, MachinePointerInfo::getUnknownStack(MF)); } +SDValue TargetLowering::expandMCOMPRESS(SDNode *Node, SelectionDAG &DAG) const { + SDLoc DL(Node); + SDValue Vec = Node->getOperand(0); + SDValue Mask = Node->getOperand(1); + + EVT VecVT = Vec.getValueType(); + EVT ScalarVT = VecVT.getScalarType(); + EVT MaskScalarVT = Mask.getValueType().getScalarType(); + + if (VecVT.isScalableVector()) + report_fatal_error( + "Expanding masked_compress for scalable vectors is undefined."); + + SDValue StackPtr = DAG.CreateStackTemporary(VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false)); + SDValue Chain = DAG.getEntryNode(); + SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + unsigned NumElms = VecVT.getVectorNumElements(); + // Skip element zero, as we always copy this to the output vector. + for (unsigned I = 0; I < NumElms; I++) { + SDValue Idx = DAG.getVectorIdxConstant(I, DL); + + SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); + SDValue OutPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); + Chain = DAG.getStore( + Chain, DL, ValI, OutPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + + // Skip this for last element. + if (I < NumElms - 1) { + // Get the mask value and add it to the current output position. This + // either increments by 1 if MaskI is true or adds 0 otherwise. 
+ SDValue MaskI = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); + MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); + MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI); + OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI); + } + } + + int FI = cast(StackPtr.getNode())->getIndex(); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); +} + bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, SDValue Mask, diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index 92aea4108a7b7f..9057e6f8967fa5 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -154,57 +154,50 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { ; CHECK-LABEL: test_compress_large: ; CHECK: ; %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; CHECK-NEXT: sub x9, sp, #48 -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK-NEXT: .cfi_def_cfa w29, 16 -; CHECK-NEXT: .cfi_offset w30, -8 -; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: ; kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umov.b w9, v2[0] -; CHECK-NEXT: umov.b w10, v2[1] +; CHECK-NEXT: umov.b w8, v2[0] +; CHECK-NEXT: umov.b w9, v2[1] ; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: umov.b w11, v2[2] +; CHECK-NEXT: umov.b w10, v2[2] ; CHECK-NEXT: umov.b w13, v2[3] -; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x11, sp ; CHECK-NEXT: umov.b w14, v2[4] ; CHECK-NEXT: str s0, [sp] -; CHECK-NEXT: bfi x12, x9, #2, #1 -; CHECK-NEXT: and w10, w10, #0x1 ; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: add w9, w9, w10 -; CHECK-NEXT: and w10, w11, #0x1 -; CHECK-NEXT: and w13, w13, #0x1 -; CHECK-NEXT: orr x11, x8, x9, lsl #2 -; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: and w15, w8, #0x1 +; CHECK-NEXT: bfi x12, x8, #2, #1 +; CHECK-NEXT: and w8, w10, #0x1 +; CHECK-NEXT: add w9, w15, w9 ; CHECK-NEXT: umov.b w10, v2[5] +; CHECK-NEXT: add w8, w9, w8 +; CHECK-NEXT: orr x15, x11, x9, lsl #2 +; CHECK-NEXT: umov.b w9, v2[6] ; CHECK-NEXT: st1.s { v0 }[1], [x12] -; CHECK-NEXT: add w13, w9, w13 -; CHECK-NEXT: orr x9, x8, x9, lsl #2 -; CHECK-NEXT: st1.s { v0 }[2], [x11] -; CHECK-NEXT: umov.b w11, v2[6] -; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: and w14, w14, #0x1 -; CHECK-NEXT: bfi x12, x13, #2, #3 -; CHECK-NEXT: st1.s { v0 }[3], [x9] -; CHECK-NEXT: add w13, w13, w14 -; CHECK-NEXT: and w9, w10, #0x1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: add w9, w13, w9 -; CHECK-NEXT: mov x14, sp -; CHECK-NEXT: str s1, [x12] -; CHECK-NEXT: and w11, w11, #0x1 -; CHECK-NEXT: bfi x10, x9, #2, #3 -; CHECK-NEXT: bfi x14, x13, #2, #3 -; CHECK-NEXT: add w9, w9, w11 -; CHECK-NEXT: bfi x8, x9, #2, #3 -; CHECK-NEXT: st1.s { v1 }[1], [x14] -; CHECK-NEXT: st1.s { v1 }[2], [x10] -; CHECK-NEXT: st1.s { v1 }[3], [x8] -; CHECK-NEXT: ldp q0, q1, [sp] -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; CHECK-NEXT: add x12, x11, w8, uxtw #2 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: st1.s { v0 }[2], [x15] +; CHECK-NEXT: add w8, w8, w13 +; CHECK-NEXT: st1.s { v0 }[3], [x12] +; CHECK-NEXT: and w12, w14, #0x1 +; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: add w12, 
w8, w12 +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and x8, x8, #0x7 +; CHECK-NEXT: add w10, w12, w10 +; CHECK-NEXT: and x12, x12, #0x7 +; CHECK-NEXT: str s1, [x11, x8, lsl #2] +; CHECK-NEXT: add w9, w10, w9 +; CHECK-NEXT: and x10, x10, #0x7 +; CHECK-NEXT: add x12, x11, x12, lsl #2 +; CHECK-NEXT: and x9, x9, #0x7 +; CHECK-NEXT: add x8, x11, x10, lsl #2 +; CHECK-NEXT: add x9, x11, x9, lsl #2 +; CHECK-NEXT: st1.s { v1 }[1], [x12] +; CHECK-NEXT: st1.s { v1 }[2], [x8] +; CHECK-NEXT: st1.s { v1 }[3], [x9] +; CHECK-NEXT: ldp q0, q1, [sp], #32 ; CHECK-NEXT: ret %out = call <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> %vec, <8 x i1> %mask) ret <8 x i32> %out From a4df959d8047012661d868e48722711a77e60a68 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Thu, 16 May 2024 14:49:21 +0200 Subject: [PATCH 10/49] Fix formatting --- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 +- .../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 16 +++++++++++----- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 3 ++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 83343ef5c173fc..26ce361f2d580f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -975,7 +975,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); - SDValue WidenVecRes_MCOMPRESS(SDNode* N); + SDValue WidenVecRes_MCOMPRESS(SDNode *N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 34146e9a123084..e94316003c35c7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2317,9 +2317,13 @@ void DAGTypeLegalizer::SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo, SDValue Compressed = TLI.expandMCOMPRESS(N, DAG); SDLoc DL(N); - EVT SubVecVT = Compressed.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Compressed, DAG.getVectorIdxConstant(0, DL)); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Compressed, DAG.getVectorIdxConstant(SubVecVT.getVectorNumElements(), DL)); + EVT SubVecVT = + Compressed.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Compressed, + DAG.getVectorIdxConstant(0, DL)); + Hi = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Compressed, + DAG.getVectorIdxConstant(SubVecVT.getVectorNumElements(), DL)); } void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { @@ -5619,8 +5623,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) { SDValue DAGTypeLegalizer::WidenVecRes_MCOMPRESS(SDNode *N) { SDValue Vec = N->getOperand(0); SDValue Mask = N->getOperand(1); - EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType()); - EVT WideMaskVT = TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType()); + EVT WideVecVT = + TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType()); + EVT WideMaskVT = + TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType()); // In the default 
expanded case, adding UNDEF values for the new widened lanes // allows us to remove their access later, which reduces the number os stores. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f3d48e60c4b585..6509f4441d2667 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11239,7 +11239,8 @@ SDValue TargetLowering::expandMCOMPRESS(SDNode *Node, SelectionDAG &DAG) const { report_fatal_error( "Expanding masked_compress for scalable vectors is undefined."); - SDValue StackPtr = DAG.CreateStackTemporary(VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false)); + SDValue StackPtr = DAG.CreateStackTemporary( + VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false)); SDValue Chain = DAG.getEntryNode(); SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); From 17004b99ba19b1886f6dc38d331e78bd5e6ebe44 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 17 May 2024 12:07:23 +0200 Subject: [PATCH 11/49] Add basic lowering of MCOMPRESS in GlobalISel --- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 1 + llvm/include/llvm/Support/TargetOpcodes.def | 3 ++ llvm/include/llvm/Target/GenericOpcodes.td | 7 +++ .../Target/GlobalISel/SelectionDAGCompat.td | 1 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 + .../CodeGen/GlobalISel/LegalizerHelper.cpp | 49 +++++++++++++++++++ .../CodeGen/SelectionDAG/TargetLowering.cpp | 1 - .../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 ++ 8 files changed, 67 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 284f434fbb9b0c..132ed3fab57fcb 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -410,6 +410,7 @@ class LegalizerHelper { LegalizeResult lowerUnmergeValues(MachineInstr &MI); LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); LegalizeResult lowerShuffleVector(MachineInstr &MI); + LegalizeResult lowerMCOMPRESS(MachineInstr &MI); Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize, Align Alignment, LLT PtrTy); LegalizeResult lowerDynStackAlloc(MachineInstr &MI); diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 559a588c251482..f85aca25fb945d 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -751,6 +751,9 @@ HANDLE_TARGET_OPCODE(G_SHUFFLE_VECTOR) /// Generic splatvector. HANDLE_TARGET_OPCODE(G_SPLAT_VECTOR) +/// Generic masked compress. +HANDLE_TARGET_OPCODE(G_MCOMPRESS) + /// Generic count trailing zeroes. HANDLE_TARGET_OPCODE(G_CTTZ) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index c40498e5542154..7cbb6cc1bb7376 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1500,6 +1500,13 @@ def G_SPLAT_VECTOR: GenericInstruction { let hasSideEffects = false; } +// Generic masked compress. 
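+// Packs the elements of $vec whose corresponding $mask lane is true into the
+// low lanes of $dst; lanes past the last selected element are undefined.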
+def G_MCOMPRESS: GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$vec, type2:$mask); + let hasSideEffects = false; +} + //------------------------------------------------------------------------------ // Vector reductions //------------------------------------------------------------------------------ diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 8fa0e4b86d6dc9..69fb235dfc2b5e 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -186,6 +186,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 5289b993476dbe..858ba547ac22a5 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1986,6 +1986,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_VECREDUCE_UMAX; case Intrinsic::vector_reduce_umin: return TargetOpcode::G_VECREDUCE_UMIN; + case Intrinsic::masked_compress: + return TargetOpcode::G_MCOMPRESS; case Intrinsic::lround: return TargetOpcode::G_LROUND; case Intrinsic::llround: diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 6a76ad7f5db749..5bab609cdefcab 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3946,6 +3946,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerExtractInsertVectorElt(MI); case G_SHUFFLE_VECTOR: return lowerShuffleVector(MI); + case G_MCOMPRESS: + return lowerMCOMPRESS(MI); case G_DYN_STACKALLOC: return lowerDynStackAlloc(MI); case G_STACKSAVE: @@ -7502,6 +7504,53 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { return Legalized; } +LegalizerHelper::LegalizeResult +LegalizerHelper::lowerMCOMPRESS(llvm::MachineInstr &MI) { + auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy] = MI.getFirst3RegLLTs(); + + if (VecTy.isScalableVector()) + report_fatal_error( + "Lowering masked_compress for scalable vectors is undefined."); + + MachinePointerInfo PtrInfo; + Register StackPtr = + createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), + getStackTemporaryAlignment(VecTy), PtrInfo) + .getReg(0); + + LLT IdxTy = LLT::scalar(32); + LLT ValTy = VecTy.getElementType(); + Align ValAlign = getStackTemporaryAlignment(ValTy); + + Register OutPos = MIRBuilder.buildConstant(IdxTy, 0).getReg(0); + + unsigned NumElmts = VecTy.getNumElements(); + for (unsigned I = 0; I < NumElmts; ++I) { + auto Idx = MIRBuilder.buildConstant(IdxTy, I); + auto Val = MIRBuilder.buildExtractVectorElement(ValTy, Vec, Idx); + Register ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos); + MIRBuilder.buildStore(Val, ElmtPtr, PtrInfo, ValAlign); + + if (I < NumElmts - 1) { + LLT MaskITy = MaskTy.getElementType(); + auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx); + if (MaskITy.getSizeInBits() > 1) + MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI); + + MaskI = MIRBuilder.buildZExt(IdxTy, MaskI); + OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI).getReg(0); + } + } + + Align VecAlign = getStackTemporaryAlignment(VecTy); + MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, 
VecAlign); + + MI.eraseFromParent(); + // TODO: This is not true! We don't know if the input vector type is legal. + // Find out how to assert this! + return Legalized; +} + Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg, Register AllocSize, Align Alignment, diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 6509f4441d2667..a8275012ef0d67 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11246,7 +11246,6 @@ SDValue TargetLowering::expandMCOMPRESS(SDNode *Node, SelectionDAG &DAG) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElms = VecVT.getVectorNumElements(); - // Skip element zero, as we always copy this to the output vector. for (unsigned I = 0; I < NumElms; I++) { SDValue Idx = DAG.getVectorIdxConstant(I, DL); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index d4aac94d24f12a..98dacdd3c6eb14 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1132,6 +1132,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .scalarize(1) .lower(); + // TOOD: FIX ThiS! + // DO NOT COMMIT + getActionDefinitionsBuilder(G_MCOMPRESS).lower(); + getActionDefinitionsBuilder({G_FSHL, G_FSHR}) .customFor({{s32, s32}, {s32, s64}, {s64, s64}}) .lower(); From 984cad1ee82cdce9c866efaa87bf600b79455150 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 17 May 2024 15:52:35 +0200 Subject: [PATCH 12/49] Add basic AArch64 MIR test --- .../GlobalISel/legalize-masked-compress.mir | 67 +++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir new file mode 100644 index 00000000000000..2d28b4a597ed36 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir @@ -0,0 +1,67 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s +--- +name: test_mcompress_v4s32 +body: | + bb.0: + liveins: $q0, $d1 + + ; CHECK-LABEL: name: test_mcompress_v4s32 + ; CHECK: liveins: $q0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1 + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C1]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64) + ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD]](p0) :: (store (s32) into %stack.0) + ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C1]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s16) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], 
[[AND]] + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C4]](s64) + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C5]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[AND1]](s32) + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64) + ; CHECK-NEXT: G_STORE [[EVEC2]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0) + ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C4]](s64) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s16) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND2]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C6]](s64) + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C5]] + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[AND3]](s32) + ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(s64) = G_MUL [[SEXT1]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL2]](s64) + ; CHECK-NEXT: G_STORE [[EVEC4]](s32), [[PTR_ADD2]](p0) :: (store (s32) into %stack.0) + ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C6]](s64) + ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s16) + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] + ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[AND4]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C7]](s64) + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C5]] + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[AND5]](s32) + ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s64) = G_MUL [[SEXT2]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL3]](s64) + ; CHECK-NEXT: G_STORE [[EVEC6]](s32), [[PTR_ADD3]](p0) :: (store (s32) into %stack.0) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[FRAME_INDEX]](p0) :: (load (<4 x s32>) from %stack.0) + ; CHECK-NEXT: $q0 = COPY [[LOAD]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s32>) = G_MCOMPRESS %0(<4 x s32>), %1(<4 x s16>) + $q0 = COPY %2(<4 x s32>) + RET_ReallyLR implicit $q0 +... 
+ + From 0ea24159782792f4164737b53cd4a077aca91b52 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 17 May 2024 16:02:41 +0200 Subject: [PATCH 13/49] Address PR comments --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 5bab609cdefcab..dc29a080882ca8 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -7508,10 +7508,6 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerMCOMPRESS(llvm::MachineInstr &MI) { auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy] = MI.getFirst3RegLLTs(); - if (VecTy.isScalableVector()) - report_fatal_error( - "Lowering masked_compress for scalable vectors is undefined."); - MachinePointerInfo PtrInfo; Register StackPtr = createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), @@ -7546,8 +7542,6 @@ LegalizerHelper::lowerMCOMPRESS(llvm::MachineInstr &MI) { MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, VecAlign); MI.eraseFromParent(); - // TODO: This is not true! We don't know if the input vector type is legal. - // Find out how to assert this! return Legalized; } From 1dc79b4124feecec746905a1dc4b903f4dac93f2 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 17 May 2024 16:39:58 +0200 Subject: [PATCH 14/49] Update docs according to PR comments --- llvm/docs/LangRef.rst | 53 ++++++++++++++++--------------------------- 1 file changed, 20 insertions(+), 33 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index e2d3a986ddedf3..ad7e20ec59e34a 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -25002,15 +25002,14 @@ Overview: Selects elements from input vector '``value``' according to the '``mask``'. All selected elements are written into adjacent lanes in the result vector, from lower to higher. -The mask holds a bit for each vector lane, and is used to select elements to be kept. -The number of valid lanes is equal to the number of active bits in the mask. +The mask holds an entry for each vector lane, and is used to select elements to be kept. +The number of valid lanes is equal to the number of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values are undefined. The main difference to :ref:`llvm.masked.compressstore ` is that the remainder of the vector may contain undefined values. This allows for branchless code and better optimization for all targets that do not support the explicit semantics of -:ref:`llvm.masked.compressstore `. +:ref:`llvm.masked.compressstore ` but still have some form of compress operations (e.g., ARM SVE and RISCV V) The result vector can be written with a similar effect, as all the selected values are at the lower positions of the -vector, but without requiring branches to avoid writes where the mask is 0. - +vector, but without requiring branches to avoid writes where the mask is ``false``. Arguments: """""""""" @@ -25022,39 +25021,27 @@ The mask and the input vector must have the same number of vector elements. Semantics: """""""""" -The '``llvm.masked.compress``' intrinsic is designed for compressing data within a vector, i.e., ideally within a register. -It allows to collect elements from possibly non-adjacent lanes of a vector and place them contiguously in the result vector in one IR operation. 
-It is useful for targets all that support compress operations (e.g., AVX-512, ARM SVE, RISCV V), which more instruction -sets do than explicit compressstore, i.e., ``llvm.masked.compress`` may yield better performance on more targets than -``llvm.masked.compressstore`` due to weaker constraints. -This intrinsic allows vectorizing loops with cross-iteration dependencies like in the following example: +The ``llvm.masked.compress`` intrinsic compresses data within a vector. +It collects elements from possibly non-adjacent lanes of a vector and place them contiguously in the result vector based on a selection mask. +This intrinsic performs the logic of the following C++ example. +All values in ``out`` after the last selected one are undefined. +If all entries in the ``mask`` are 0, the entire ``out`` vector is undefined. -.. code-block:: c +.. code-block:: cpp - // Consecutively store selected values with branchless code. - int *in, *out; bool *mask; int pos = 0; - for (int i = 0; i < size; ++i) { - out[pos] = in[i]; - // if mask[i] == 0, the current value is overwritten in the next iteration. - pos += mask[i]; + // Consecutively place selected values in a vector. + using VecT __attribute__((vector_size(N))) = int; + VecT compress(VecT vec, VecT mask) { + VecT out; + int idx = 0; + for (int i = 0; i < N / sizeof(int); ++i) { + out[idx] = vec[i]; + idx += static_cast(mask[i]); + } + return out; } -.. code-block:: llvm - - ; Load elements from `in`. - %vec = load <4 x i32>, ptr %inPtr - %mask = load <4 x i1>, ptr %maskPtr - %compressed = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> %vec, <4 x i1> %mask) - store <4 x i32> %compressed, ptr %outPtr - - ; %outPtr should be increased in each iteration by the number of '1's in the mask. - %iMask = bitcast <4 x i1> %mask to i4 - %popcnt = call i4 @llvm.ctpop.i4(i4 %iMask) - %zextPopcnt = zext i4 %popcnt to i64 - %nextOut = add i64 %outPos, %zextPopcnt - - Memory Use Markers ------------------ From c8515caf06a0ca91fc702b3fce23751e9b1be4f1 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 21 May 2024 10:21:34 +0200 Subject: [PATCH 15/49] Match result and input types of MCOMPRES --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 3 ++- llvm/include/llvm/IR/Intrinsics.td | 2 +- .../CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 71dfd8b43b7108..c3973f734f5cd0 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1294,7 +1294,8 @@ enum NodeType { MLOAD, MSTORE, - // Masked compress - consecutively place vector elements based on mask + // Masked compress - MCOMPRESS(Vec, Mask) + // consecutively place vector elements based on mask // e.g., vec = {A, B, C, D} and mask = 1010 // --> {A, C, ?, ?} where ? is undefined MCOMPRESS, diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index e924d28956b0a5..f8a27dd57b628e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2364,7 +2364,7 @@ def int_masked_compressstore: def int_masked_compress: DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [IntrNoMem, IntrWillReturn]>; // Test whether a pointer is associated with a type metadata identifier. 
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index d71111b57efe51..a732c4d6ed5f37 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -642,6 +642,9 @@ # DEBUG-NEXT: G_SPLAT_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_MCOMPRESS (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected From 8353b2dd5c3d77afb4b9cc2e3cffe035cf4a77a4 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 21 May 2024 16:05:35 +0200 Subject: [PATCH 16/49] Add constant folding to SelectionDAG::getNode() --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 46 ++++++++++++++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 4 +- llvm/test/CodeGen/AArch64/masked-compress.ll | 58 ++++++++++++++----- 3 files changed, 92 insertions(+), 16 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 247f52370e4c11..87e27d701eb9de 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6279,7 +6279,9 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, // We can't create a scalar CONCAT_VECTORS so skip it. It will break // for concats involving SPLAT_VECTOR. Concats of BUILD_VECTORS are handled by // foldCONCAT_VECTORS in getNode before this is called. - if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS) + // MCOMPRESS is not defined for scalars. We handle constants before this call. + if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS || + Opcode == ISD::MCOMPRESS) return SDValue(); unsigned NumOps = Ops.size(); @@ -7197,6 +7199,48 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return N1.getOperand(1); break; } + case ISD::MCOMPRESS: { + EVT VecVT = N1.getValueType(); + EVT MaskVT = N2.getValueType(); + (void)MaskVT; + assert(VT == VecVT && "Vector and result type don't match."); + assert(VecVT.isVector() && MaskVT.isVector() && + "Both inputs must be vectors."); + assert(VecVT.isScalableVector() == MaskVT.isScalableVector() && + "Inputs must both be either fixed or scalable vectors."); + assert(VecVT.getVectorElementCount() == MaskVT.getVectorElementCount() && + "Vector and mask must have same number of elements."); + + APInt SplatVal; + if (ISD::isConstantSplatVector(N2.getNode(), SplatVal)) + return SplatVal.isAllOnes() ? N1 : getUNDEF(VecVT); + + // No need for potentially expensive compress if the mask is constant. 
+ if (ISD::isBuildVectorOfConstantSDNodes(N2.getNode())) { + SmallVector Ops; + EVT ScalarVT = VecVT.getVectorElementType(); + unsigned NumSelected = 0; + unsigned NumElmts = VecVT.getVectorNumElements(); + for (unsigned I = 0; I < NumElmts; ++I) { + SDValue MaskI = N2.getOperand(I); + if (MaskI.isUndef()) + continue; + + ConstantSDNode *CMaskI = cast(MaskI); + if (CMaskI->isAllOnes()) { + SDValue VecI = getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, N1, + getVectorIdxConstant(I, DL)); + Ops.push_back(VecI); + NumSelected++; + } + } + for (unsigned Rest = NumSelected; Rest < NumElmts; ++Rest) { + Ops.push_back(getUNDEF(ScalarVT)); + } + return getBuildVector(VecVT, DL, Ops); + } + break; + } } // Perform trivial constant folding. diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a8275012ef0d67..bc8c2ed942609f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11235,9 +11235,9 @@ SDValue TargetLowering::expandMCOMPRESS(SDNode *Node, SelectionDAG &DAG) const { EVT ScalarVT = VecVT.getScalarType(); EVT MaskScalarVT = Mask.getValueType().getScalarType(); + // Needs to be handled by targets that have scalable vector types. if (VecVT.isScalableVector()) - report_fatal_error( - "Expanding masked_compress for scalable vectors is undefined."); + report_fatal_error("Cannot expand masked_compress for scalable vectors."); SDValue StackPtr = DAG.CreateStackTemporary( VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false)); diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index 9057e6f8967fa5..c29b49eb23439e 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -203,21 +203,53 @@ define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { ret <8 x i32> %out } -define <4 x i32> @test_compress_const(<4 x i32> %vec, <4 x i1> %mask) { -; CHECK-LABEL: test_compress_const: +define <4 x i32> @test_compress_all_const() { +; CHECK: lCPI4_0: +; CHECK-NEXT: .long 5 +; CHECK-NEXT: .long 9 +; CHECK-LABEL: test_compress_all_const: +; CHECK: ; %bb.0: +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x8, lCPI4_0@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: ldr q0, [x8, lCPI4_0@PAGEOFF] +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> , + <4 x i1> ) + ret <4 x i32> %out +} + +define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_mask: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.s v0[1], v0[3] +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> ) + ret <4 x i32> %out +} + +; We pass a placeholder value for the const_mask* tests to check that they are converted to a no-op by simply copying +; the second vector input register to the return register or doing nothing. 
+define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_splat1_mask: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.16b v0, v1 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1)) + ret <4 x i32> %out +} +define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_splat0_mask: +; CHECK: ; %bb.0: +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 0)) + ret <4 x i32> %out +} +define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) { +; CHECK-LABEL: test_compress_undef_mask: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov x8, #3 ; =0x3 -; CHECK-NEXT: mov w9, #9 ; =0x9 -; CHECK-NEXT: movk x8, #7, lsl #32 -; CHECK-NEXT: str x8, [sp, #-16]! -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: mov w8, #5 ; =0x5 -; CHECK-NEXT: str w9, [sp, #8] -; CHECK-NEXT: str w8, [sp] -; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> , - <4 x i1> ) + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> undef) ret <4 x i32> %out } From a9aba2916bc175041876a1952e4718d478a9eadb Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 21 May 2024 16:53:00 +0200 Subject: [PATCH 17/49] Address PR comments for type legalization --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 9 ++------- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 10 +--------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 4063144f473937..9b70f3d332b63b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2348,15 +2348,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, } SDValue DAGTypeLegalizer::PromoteIntOp_MCOMPRESS(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Can only promote MCOMPRESS mask."); SDValue Vec = N->getOperand(0); - SDValue Mask = N->getOperand(1); EVT VT = Vec.getValueType(); - - if (OpNo == 0) - Vec = GetPromotedInteger(Vec); - else - Mask = PromoteTargetBoolean(Mask, VT); - + SDValue Mask = PromoteTargetBoolean(N->getOperand(1), VT); return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), VT, Vec, Mask); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index e94316003c35c7..aadfc16e66a10a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2315,15 +2315,7 @@ void DAGTypeLegalizer::SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo, // operation and then extract the Lo and Hi vectors from that. This gets rid // of MCOMPRESS and all other operands can be legalized later. 
SDValue Compressed = TLI.expandMCOMPRESS(N, DAG); - - SDLoc DL(N); - EVT SubVecVT = - Compressed.getValueType().getHalfNumVectorElementsVT(*DAG.getContext()); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Compressed, - DAG.getVectorIdxConstant(0, DL)); - Hi = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Compressed, - DAG.getVectorIdxConstant(SubVecVT.getVectorNumElements(), DL)); + std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N)); } void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) { From b0af3203b7cbf9838c8e3dc1dedeea506cf208dd Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 21 May 2024 16:57:24 +0200 Subject: [PATCH 18/49] Move masked.compress in docs --- llvm/docs/LangRef.rst | 131 +++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 66 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index ad7e20ec59e34a..ff29fb10007dac 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19197,6 +19197,71 @@ the follow sequence of operations: The ``mask`` operand will apply to at least the gather and scatter operations. + +.. _int_masked_compress: + +'``llvm.masked.compress.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +LLVM provides an intrinsic for compressing data within a vector based on a selection mask. +Semantically, this is similar to :ref:`llvm.masked.compressstore ` but with weaker assumptions +and without storing the results to memory, i.e., the data remains in the vector. + +Syntax: +""""""" +This is an overloaded intrinsic. A number of scalar values of integer, floating point or pointer data type are collected +from an input vector and placed adjacently within the result vector. A mask defines which elements to collect from the vector. + +.. code-block:: llvm + + declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> , <8 x i1> ) + declare <16 x float> @llvm.masked.compress.v16f32(<16 x float> , <16 x i1> ) + +Overview: +""""""""" + +Selects elements from input vector '``value``' according to the '``mask``'. +All selected elements are written into adjacent lanes in the result vector, from lower to higher. +The mask holds an entry for each vector lane, and is used to select elements to be kept. +The number of valid lanes is equal to the number of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values are undefined. +The main difference to :ref:`llvm.masked.compressstore ` is that the remainder of the vector may +contain undefined values. +This allows for branchless code and better optimization for all targets that do not support the explicit semantics of +:ref:`llvm.masked.compressstore ` but still have some form of compress operations (e.g., ARM SVE and RISCV V) +The result vector can be written with a similar effect, as all the selected values are at the lower positions of the +vector, but without requiring branches to avoid writes where the mask is ``false``. + +Arguments: +"""""""""" + +The first operand is the input vector, from which elements are selected. +The second operand is the mask, a vector of boolean values. +The mask and the input vector must have the same number of vector elements. + +Semantics: +"""""""""" + +The ``llvm.masked.compress`` intrinsic compresses data within a vector. +It collects elements from possibly non-adjacent lanes of a vector and places them contiguously in the result vector based on a selection mask. +This intrinsic performs the logic of the following C++ example.
+All values in ``out`` after the last selected one are undefined. +If all entries in the ``mask`` are 0, the entire ``out`` vector is undefined. + +.. code-block:: cpp + + // Consecutively place selected values in a vector. + using VecT __attribute__((vector_size(N))) = int; + VecT compress(VecT vec, VecT mask) { + VecT out; + int idx = 0; + for (int i = 0; i < N / sizeof(int); ++i) { + out[idx] = vec[i]; + idx += static_cast(mask[i]); + } + return out; + } + + Matrix Intrinsics ----------------- @@ -24975,72 +25040,6 @@ The '``llvm.masked.compressstore``' intrinsic is designed for compressing data i Other targets may support this intrinsic differently, for example, by lowering it into a sequence of branches that guard scalar store operations. -Masked Vector Compress Intrinsic --------------------------------- - -LLVM provides an intrinsic for compressing data within a vector based on a selection mask. -Semantically, this is similar to :ref:`llvm.masked.compressstore ` but with weaker assumptions -and without storing the results to memory, i.e., the data remains in the vector. - -.. _int_masked_compress: - -'``llvm.masked.compress.*``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" -This is an overloaded intrinsic. A number of scalar values of integer, floating point or pointer data type are collected -from an input vector and placed adjacently within the result vector. A mask defines which elements to collect from the vector. - -:: code-block:: llvm - - declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> , <8 x i1> ) - declare <16 x float> @llvm.masked.compress.v16f32(<16 x float> , <16 x i1> ) - -Overview: -""""""""" - -Selects elements from input vector '``value``' according to the '``mask``'. -All selected elements are written into adjacent lanes in the result vector, from lower to higher. -The mask holds an entry for each vector lane, and is used to select elements to be kept. -The number of valid lanes is equal to the number of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values are undefined. -The main difference to :ref:`llvm.masked.compressstore ` is that the remainder of the vector may -contain undefined values. -This allows for branchless code and better optimization for all targets that do not support the explicit semantics of -:ref:`llvm.masked.compressstore ` but still have some form of compress operations (e.g., ARM SVE and RISCV V) -The result vector can be written with a similar effect, as all the selected values are at the lower positions of the -vector, but without requiring branches to avoid writes where the mask is ``false``. - -Arguments: -"""""""""" - -The first operand is the input vector, from which elements are selected. -The second operand is the mask, a vector of boolean values. -The mask and the input vector must have the same number of vector elements. - -Semantics: -"""""""""" - -The ``llvm.masked.compress`` intrinsic compresses data within a vector. -It collects elements from possibly non-adjacent lanes of a vector and place them contiguously in the result vector based on a selection mask. -This intrinsic performs the logic of the following C++ example. -All values in ``out`` after the last selected one are undefined. -If all entries in the ``mask`` are 0, the entire ``out`` vector is undefined. - -.. code-block:: cpp - - // Consecutively place selected values in a vector. 
- using VecT __attribute__((vector_size(N))) = int; - VecT compress(VecT vec, VecT mask) { - VecT out; - int idx = 0; - for (int i = 0; i < N / sizeof(int); ++i) { - out[idx] = vec[i]; - idx += static_cast(mask[i]); - } - return out; - } - Memory Use Markers ------------------ From d9587c7beb89fcbaf4a917e9fb3e6ce87ccc20f3 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 21 May 2024 17:17:49 +0200 Subject: [PATCH 19/49] Fix bug for x86 result widening --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index aadfc16e66a10a..ec872b7c4c4b7c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -5617,8 +5617,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_MCOMPRESS(SDNode *N) { SDValue Mask = N->getOperand(1); EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType()); - EVT WideMaskVT = - TLI.getTypeToTransformTo(*DAG.getContext(), Mask.getValueType()); + EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), + Mask.getValueType().getVectorElementType(), + WideVecVT.getVectorNumElements()); // In the default expanded case, adding UNDEF values for the new widened lanes // allows us to remove their access later, which reduces the number os stores. From c04da9bbd2ae56283722822190dd6ad8922af5be Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 21 May 2024 18:45:37 +0200 Subject: [PATCH 20/49] Remove zero-fill when widening vector --- .../SelectionDAG/LegalizeVectorTypes.cpp | 6 ++-- llvm/test/CodeGen/AArch64/masked-compress.ll | 36 +++++++------------ 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index ec872b7c4c4b7c..5652a5287b0dc4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -5621,10 +5621,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_MCOMPRESS(SDNode *N) { Mask.getValueType().getVectorElementType(), WideVecVT.getVectorNumElements()); - // In the default expanded case, adding UNDEF values for the new widened lanes - // allows us to remove their access later, which reduces the number os stores. 
- SDValue WideVec = ModifyToType(Vec, WideVecVT, /*FillWithZeroes=*/true); - SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true); + SDValue WideVec = ModifyToType(Vec, WideVecVT); + SDValue WideMask = ModifyToType(Mask, WideMaskVT); return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), WideVecVT, WideVec, WideMask); } diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index c29b49eb23439e..c5382f591b87e0 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -322,10 +322,9 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movi.2d v1, #0000000000000000 +; CHECK-NEXT: fmov s1, w0 ; CHECK-NEXT: mov x11, sp ; CHECK-NEXT: str s0, [sp] -; CHECK-NEXT: mov.h v1[0], w0 ; CHECK-NEXT: mov.h v1[1], w1 ; CHECK-NEXT: mov.h v1[2], w2 ; CHECK-NEXT: ushll.4s v1, v1, #0 @@ -345,7 +344,7 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { ; CHECK-NEXT: orr x8, x10, x8, lsl #2 ; CHECK-NEXT: bfi x10, x9, #2, #2 ; CHECK-NEXT: st1.s { v0 }[2], [x8] -; CHECK-NEXT: str wzr, [x10] +; CHECK-NEXT: st1.s { v0 }[3], [x10] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret %out = call <3 x i32> @llvm.masked.compress.v3i32(<3 x i32> %vec, <3 x i1> %mask) @@ -357,34 +356,23 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: add x12, sp, #8 -; CHECK-NEXT: movi.4h v2, #7 -; CHECK-NEXT: add x11, sp, #8 -; CHECK-NEXT: mov.h v1[1], w1 -; CHECK-NEXT: mov.h v0[0], w3 -; CHECK-NEXT: mov.h v1[2], w2 +; CHECK-NEXT: fmov s0, w3 +; CHECK-NEXT: add x10, sp, #8 +; CHECK-NEXT: strh w0, [sp, #8] ; CHECK-NEXT: mov.h v0[1], w4 ; CHECK-NEXT: mov.h v0[2], w5 ; CHECK-NEXT: shl.4h v0, v0, #15 ; CHECK-NEXT: cmlt.4h v0, v0, #0 ; CHECK-NEXT: umov.h w8, v0[0] ; CHECK-NEXT: umov.h w9, v0[1] -; CHECK-NEXT: umov.h w10, v0[2] -; CHECK-NEXT: and.8b v0, v1, v2 ; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: and w13, w8, #0x1 -; CHECK-NEXT: bfi x12, x8, #1, #1 -; CHECK-NEXT: add w8, w13, w9 -; CHECK-NEXT: and w9, w10, #0x1 -; CHECK-NEXT: str h0, [sp, #8] -; CHECK-NEXT: orr x10, x11, x8, lsl #1 -; CHECK-NEXT: add w8, w8, w9 -; CHECK-NEXT: st1.h { v0 }[1], [x12] -; CHECK-NEXT: bfi x11, x8, #1, #2 -; CHECK-NEXT: st1.h { v0 }[2], [x10] -; CHECK-NEXT: strh wzr, [x11] +; CHECK-NEXT: and w11, w8, #0x1 +; CHECK-NEXT: bfi x10, x8, #1, #1 +; CHECK-NEXT: add w8, w11, w9 +; CHECK-NEXT: add x9, sp, #8 +; CHECK-NEXT: orr x8, x9, x8, lsl #1 +; CHECK-NEXT: strh w1, [x10] +; CHECK-NEXT: strh w2, [x8] ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: umov.h w0, v0[0] ; CHECK-NEXT: umov.h w1, v0[1] From a60523cab98bd81f0c18589094c874ce0a33468c Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 22 May 2024 12:57:08 +0200 Subject: [PATCH 21/49] Use [[maybe_unused]] for asserts --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 87e27d701eb9de..6a617e3986de9b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6280,6 +6280,7 @@ SDValue 
SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, // for concats involving SPLAT_VECTOR. Concats of BUILD_VECTORS are handled by // foldCONCAT_VECTORS in getNode before this is called. // MCOMPRESS is not defined for scalars. We handle constants before this call. + // TODO: Make this aware of vector-only opcodes and skip all of them. if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS || Opcode == ISD::MCOMPRESS) return SDValue(); @@ -7201,8 +7202,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::MCOMPRESS: { EVT VecVT = N1.getValueType(); - EVT MaskVT = N2.getValueType(); - (void)MaskVT; + [[maybe_unused]] EVT MaskVT = N2.getValueType(); assert(VT == VecVT && "Vector and result type don't match."); assert(VecVT.isVector() && MaskVT.isVector() && "Both inputs must be vectors."); From 9279e5ee5046494426f48707ffc8ba78106b97f6 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 24 May 2024 09:14:59 +0200 Subject: [PATCH 22/49] Move constant folding to DAGCombiner --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 43 +++++++++++++++++++ .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 31 +------------ 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index a044b6dc4838ad..beff3386426813 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -540,6 +540,7 @@ namespace { SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitSCALAR_TO_VECTOR(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); + SDValue visitMCOMPRESS(SDNode *N); SDValue visitMLOAD(SDNode *N); SDValue visitMSTORE(SDNode *N); SDValue visitMGATHER(SDNode *N); @@ -1960,6 +1961,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MLOAD: return visitMLOAD(N); case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); + case ISD::MCOMPRESS: return visitMCOMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); @@ -12015,6 +12017,47 @@ SDValue DAGCombiner::visitVP_STRIDED_STORE(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitMCOMPRESS(SDNode *N) { + SDLoc DL(N); + SDValue Vec = N->getOperand(0); + SDValue Mask = N->getOperand(1); + EVT VecVT = Vec.getValueType(); + + APInt SplatVal; + if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) + return SplatVal.isAllOnes() ? Vec : DAG.getUNDEF(VecVT); + + if (Vec.isUndef() || Mask.isUndef()) + return DAG.getUNDEF(VecVT); + + // No need for potentially expensive compress if the mask is constant. 
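+ // e.g., vec = {A, B, C, D} with mask = {1, 0, 1, 0} folds to the
+ // build_vector {A, C, undef, undef} constructed below.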
+ if (ISD::isBuildVectorOfConstantSDNodes(Mask.getNode())) { + SmallVector Ops; + EVT ScalarVT = VecVT.getVectorElementType(); + unsigned NumSelected = 0; + unsigned NumElmts = VecVT.getVectorNumElements(); + for (unsigned I = 0; I < NumElmts; ++I) { + SDValue MaskI = Mask.getOperand(I); + if (MaskI.isUndef()) + continue; + + ConstantSDNode *CMaskI = cast(MaskI); + if (CMaskI->isAllOnes()) { + SDValue VecI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, + DAG.getVectorIdxConstant(I, DL)); + Ops.push_back(VecI); + NumSelected++; + } + } + for (unsigned Rest = NumSelected; Rest < NumElmts; ++Rest) { + Ops.push_back(DAG.getUNDEF(ScalarVT)); + } + return DAG.getBuildVector(VecVT, DL, Ops); + } + + return SDValue(); +} + SDValue DAGCombiner::visitVPGATHER(SDNode *N) { VPGatherSDNode *MGT = cast(N); SDValue Mask = MGT->getMask(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6a617e3986de9b..775cf64a82a0da 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7206,39 +7206,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(VT == VecVT && "Vector and result type don't match."); assert(VecVT.isVector() && MaskVT.isVector() && "Both inputs must be vectors."); - assert(VecVT.isScalableVector() == MaskVT.isScalableVector() && - "Inputs must both be either fixed or scalable vectors."); assert(VecVT.getVectorElementCount() == MaskVT.getVectorElementCount() && "Vector and mask must have same number of elements."); - APInt SplatVal; - if (ISD::isConstantSplatVector(N2.getNode(), SplatVal)) - return SplatVal.isAllOnes() ? N1 : getUNDEF(VecVT); - - // No need for potentially expensive compress if the mask is constant. - if (ISD::isBuildVectorOfConstantSDNodes(N2.getNode())) { - SmallVector Ops; - EVT ScalarVT = VecVT.getVectorElementType(); - unsigned NumSelected = 0; - unsigned NumElmts = VecVT.getVectorNumElements(); - for (unsigned I = 0; I < NumElmts; ++I) { - SDValue MaskI = N2.getOperand(I); - if (MaskI.isUndef()) - continue; + if (N1.isUndef() || N2.isUndef()) + return getUNDEF(VecVT); - ConstantSDNode *CMaskI = cast(MaskI); - if (CMaskI->isAllOnes()) { - SDValue VecI = getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, N1, - getVectorIdxConstant(I, DL)); - Ops.push_back(VecI); - NumSelected++; - } - } - for (unsigned Rest = NumSelected; Rest < NumElmts; ++Rest) { - Ops.push_back(getUNDEF(ScalarVT)); - } - return getBuildVector(VecVT, DL, Ops); - } break; } } From b48dadadd167ac3eb3346a6e27a3edd482637c30 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 24 May 2024 09:19:27 +0200 Subject: [PATCH 23/49] Change TODO for AArch64 GlobalISel --- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 98dacdd3c6eb14..cf4390edfca3b0 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1132,8 +1132,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .scalarize(1) .lower(); - // TOOD: FIX ThiS! - // DO NOT COMMIT + // TODO: Update this to correct handling when adding AArch64/SVE support. 
getActionDefinitionsBuilder(G_MCOMPRESS).lower(); getActionDefinitionsBuilder({G_FSHL, G_FSHR}) From d4436717aa9579b3245ef365ac2a20c2ce77c3bf Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 24 May 2024 09:24:33 +0200 Subject: [PATCH 24/49] Remove wrong ISA from docs --- llvm/docs/LangRef.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index ff29fb10007dac..af7c65d24b320b 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19226,8 +19226,8 @@ The mask holds an entry for each vector lane, and is used to select elements to The number of valid lanes is equal to the number of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values are undefined. The main difference to :ref:`llvm.masked.compressstore ` is that the remainder of the vector may contain undefined values. -This allows for branchless code and better optimization for all targets that do not support the explicit semantics of -:ref:`llvm.masked.compressstore ` but still have some form of compress operations (e.g., ARM SVE and RISCV V) +This allows for branchless code and better optimization for all targets that do not support or have inefficient instructions +of the explicit semantics of :ref:`llvm.masked.compressstore ` but still have some form of compress operations. The result vector can be written with a similar effect, as all the selected values are at the lower positions of the vector, but without requiring branches to avoid writes where the mask is ``false``. From d45f61ba6316ecd5cf04c0c7946d93a1633fa5ca Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 7 Jun 2024 09:04:34 +0200 Subject: [PATCH 25/49] Rename MCOMPRESS to MASKED_COMPRESS --- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 +- llvm/include/llvm/CodeGen/ISDOpcodes.h | 6 +++--- llvm/include/llvm/CodeGen/TargetLowering.h | 4 ++-- llvm/include/llvm/Support/TargetOpcodes.def | 2 +- llvm/include/llvm/Target/GenericOpcodes.td | 2 +- .../Target/GlobalISel/SelectionDAGCompat.td | 2 +- llvm/include/llvm/Target/TargetSelectionDAG.td | 2 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 2 +- .../lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 6 +++--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++--- .../SelectionDAG/LegalizeIntegerTypes.cpp | 18 +++++++++--------- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 8 ++++---- .../CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 6 +++--- .../SelectionDAG/LegalizeVectorTypes.cpp | 18 +++++++++--------- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 +++--- .../SelectionDAG/SelectionDAGBuilder.cpp | 2 +- .../SelectionDAG/SelectionDAGDumper.cpp | 2 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 2 +- .../GlobalISel/legalize-masked-compress.mir | 6 +++--- .../GlobalISel/legalizer-info-validation.mir | 2 +- 22 files changed, 54 insertions(+), 54 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 132ed3fab57fcb..6b379c699d42ae 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -410,7 +410,7 @@ class LegalizerHelper { LegalizeResult lowerUnmergeValues(MachineInstr &MI); LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); LegalizeResult lowerShuffleVector(MachineInstr &MI); - LegalizeResult lowerMCOMPRESS(MachineInstr &MI); + 
LegalizeResult lowerMASKED_COMPRESS(MachineInstr &MI); Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize, Align Alignment, LLT PtrTy); LegalizeResult lowerDynStackAlloc(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index c3973f734f5cd0..aa5aedbed9be29 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1294,11 +1294,11 @@ enum NodeType { MLOAD, MSTORE, - // Masked compress - MCOMPRESS(Vec, Mask) + // MASKED_COMPRESS(Vec, Mask) // consecutively place vector elements based on mask - // e.g., vec = {A, B, C, D} and mask = 1010 + // e.g., vec = {A, B, C, D} and mask = {1, 0, 1, 0} // --> {A, C, ?, ?} where ? is undefined - MCOMPRESS, + MASKED_COMPRESS, // Masked gather and scatter - load and store operations for a vector of // random addresses with additional mask operand that prevents memory diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index ad79c22aa3e1b6..336f85f850ddc2 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5472,9 +5472,9 @@ class TargetLowering : public TargetLoweringBase { /// method accepts vectors as its arguments. SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const; - /// Expand a vector MCOMPRESS into a sequence of extract element, store + /// Expand a vector MASKED_COMPRESS into a sequence of extract element, store /// temporarily, advance store position, before re-loading the final vector. - SDValue expandMCOMPRESS(SDNode *Node, SelectionDAG &DAG) const; + SDValue expandMASKED_COMPRESS(SDNode *Node, SelectionDAG &DAG) const; /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC /// on the current target. A VP_SETCC will additionally be given a Mask diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index f85aca25fb945d..f41e23d935e45e 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -752,7 +752,7 @@ HANDLE_TARGET_OPCODE(G_SHUFFLE_VECTOR) HANDLE_TARGET_OPCODE(G_SPLAT_VECTOR) /// Generic masked compress. -HANDLE_TARGET_OPCODE(G_MCOMPRESS) +HANDLE_TARGET_OPCODE(G_MASKED_COMPRESS) /// Generic count trailing zeroes. HANDLE_TARGET_OPCODE(G_CTTZ) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 7cbb6cc1bb7376..29e82bf6398428 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1501,7 +1501,7 @@ def G_SPLAT_VECTOR: GenericInstruction { } // Generic masked compress. 
-def G_MCOMPRESS: GenericInstruction { +def G_MASKED_COMPRESS: GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type1:$vec, type2:$mask); let hasSideEffects = false; diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 69fb235dfc2b5e..3a385f222cd0ef 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -186,7 +186,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 061330fb4e08f3..42a6a22cebc604 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -735,7 +735,7 @@ def masked_gather : SDNode<"ISD::MGATHER", SDTMaskedGather, def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def masked_compress : SDNode<"ISD::MCOMPRESS", SDTMaskedCompress>; +def masked_compress : SDNode<"ISD::MASKED_COMPRESS", SDTMaskedCompress>; // Do not use ld, st directly. Use load, extload, sextload, zextload, store, // and truncst (see below). diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 858ba547ac22a5..097e4022affdd1 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1987,7 +1987,7 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { case Intrinsic::vector_reduce_umin: return TargetOpcode::G_VECREDUCE_UMIN; case Intrinsic::masked_compress: - return TargetOpcode::G_MCOMPRESS; + return TargetOpcode::G_MASKED_COMPRESS; case Intrinsic::lround: return TargetOpcode::G_LROUND; case Intrinsic::llround: diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index dc29a080882ca8..b646454c3d345e 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3946,8 +3946,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerExtractInsertVectorElt(MI); case G_SHUFFLE_VECTOR: return lowerShuffleVector(MI); - case G_MCOMPRESS: - return lowerMCOMPRESS(MI); + case G_MASKED_COMPRESS: + return lowerMASKED_COMPRESS(MI); case G_DYN_STACKALLOC: return lowerDynStackAlloc(MI); case G_STACKSAVE: @@ -7505,7 +7505,7 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerMCOMPRESS(llvm::MachineInstr &MI) { +LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy] = MI.getFirst3RegLLTs(); MachinePointerInfo PtrInfo; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index beff3386426813..50bb743da3809c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -540,7 +540,7 @@ namespace { SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitSCALAR_TO_VECTOR(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); - SDValue visitMCOMPRESS(SDNode *N); + SDValue visitMASKED_COMPRESS(SDNode *N); SDValue visitMLOAD(SDNode *N); SDValue visitMSTORE(SDNode *N); SDValue visitMGATHER(SDNode *N); @@ -1961,7 
+1961,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MLOAD: return visitMLOAD(N); case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); - case ISD::MCOMPRESS: return visitMCOMPRESS(N); + case ISD::MASKED_COMPRESS: return visitMASKED_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); @@ -12017,7 +12017,7 @@ SDValue DAGCombiner::visitVP_STRIDED_STORE(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitMCOMPRESS(SDNode *N) { +SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { SDLoc DL(N); SDValue Vec = N->getOperand(0); SDValue Mask = N->getOperand(1); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 9b70f3d332b63b..50f729f90fea7a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -87,8 +87,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { break; case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast(N)); break; - case ISD::MCOMPRESS: - Res = PromoteIntRes_MCOMPRESS(N); + case ISD::MASKED_COMPRESS: + Res = PromoteIntRes_MASKED_COMPRESS(N); break; case ISD::SELECT: case ISD::VSELECT: @@ -951,9 +951,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { return Res; } -SDValue DAGTypeLegalizer::PromoteIntRes_MCOMPRESS(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteIntRes_MASKED_COMPRESS(SDNode *N) { SDValue Vec = GetPromotedInteger(N->getOperand(0)); - return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), Vec.getValueType(), Vec, + return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), Vec.getValueType(), Vec, N->getOperand(1)); } @@ -1864,8 +1864,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), OpNo); break; - case ISD::MCOMPRESS: - Res = PromoteIntOp_MCOMPRESS(N, OpNo); + case ISD::MASKED_COMPRESS: + Res = PromoteIntOp_MASKED_COMPRESS(N, OpNo); break; case ISD::VP_TRUNCATE: case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; @@ -2347,12 +2347,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, N->getIndexType(), TruncateStore); } -SDValue DAGTypeLegalizer::PromoteIntOp_MCOMPRESS(SDNode *N, unsigned OpNo) { - assert(OpNo == 1 && "Can only promote MCOMPRESS mask."); +SDValue DAGTypeLegalizer::PromoteIntOp_MASKED_COMPRESS(SDNode *N, unsigned OpNo) { + assert(OpNo == 1 && "Can only promote MASKED_COMPRESS mask."); SDValue Vec = N->getOperand(0); EVT VT = Vec.getValueType(); SDValue Mask = PromoteTargetBoolean(N->getOperand(1), VT); - return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), VT, Vec, Mask); + return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), VT, Vec, Mask); } SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 26ce361f2d580f..898a9e0110548d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -321,7 +321,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); - SDValue PromoteIntRes_MCOMPRESS(SDNode *N); + SDValue PromoteIntRes_MASKED_COMPRESS(SDNode *N); SDValue 
PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_FFREXP(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); @@ -391,7 +391,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); - SDValue PromoteIntOp_MCOMPRESS(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_MASKED_COMPRESS(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); SDValue PromoteIntOp_FIX(SDNode *N); SDValue PromoteIntOp_ExpOp(SDNode *N); @@ -884,7 +884,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi, bool SplitSETCC = false); - void SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_MASKED_COMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -975,7 +975,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); - SDValue WidenVecRes_MCOMPRESS(SDNode *N); + SDValue WidenVecRes_MASKED_COMPRESS(SDNode *N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index bb2d3a8a3a0d16..016b8352a9e434 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -442,7 +442,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: case ISD::MGATHER: - case ISD::MCOMPRESS: + case ISD::MASKED_COMPRESS: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1102,8 +1102,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; break; - case ISD::MCOMPRESS: - Results.push_back(TLI.expandMCOMPRESS(Node, DAG)); + case ISD::MASKED_COMPRESS: + Results.push_back(TLI.expandMASKED_COMPRESS(Node, DAG)); return; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5652a5287b0dc4..034d52fe3119f1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1058,8 +1058,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_GATHER: SplitVecRes_Gather(cast(N), Lo, Hi, /*SplitSETCC*/ true); break; - case ISD::MCOMPRESS: - SplitVecRes_MCOMPRESS(N, Lo, Hi); + case ISD::MASKED_COMPRESS: + SplitVecRes_MASKED_COMPRESS(N, Lo, Hi); break; case ISD::SETCC: case ISD::VP_SETCC: @@ -2307,14 +2307,14 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, ReplaceValueWith(SDValue(N, 1), Ch); } -void DAGTypeLegalizer::SplitVecRes_MCOMPRESS(SDNode *N, SDValue &Lo, +void DAGTypeLegalizer::SplitVecRes_MASKED_COMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi) { // This is not "trivial", as there is a dependency between the two subvectors. 
// Depending on the number of 1s in the mask, the elements from the Hi vector // need to be moved to the Lo vector. So we just perform this as one "big" // operation and then extract the Lo and Hi vectors from that. This gets rid - // of MCOMPRESS and all other operands can be legalized later. - SDValue Compressed = TLI.expandMCOMPRESS(N, DAG); + // of MASKED_COMPRESS and all other operands can be legalized later. + SDValue Compressed = TLI.expandMASKED_COMPRESS(N, DAG); std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N)); } @@ -4227,8 +4227,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: Res = WidenVecRes_VP_STRIDED_LOAD(cast(N)); break; - case ISD::MCOMPRESS: - Res = WidenVecRes_MCOMPRESS(N); + case ISD::MASKED_COMPRESS: + Res = WidenVecRes_MASKED_COMPRESS(N); break; case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast(N)); @@ -5612,7 +5612,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) { return Res; } -SDValue DAGTypeLegalizer::WidenVecRes_MCOMPRESS(SDNode *N) { +SDValue DAGTypeLegalizer::WidenVecRes_MASKED_COMPRESS(SDNode *N) { SDValue Vec = N->getOperand(0); SDValue Mask = N->getOperand(1); EVT WideVecVT = @@ -5623,7 +5623,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MCOMPRESS(SDNode *N) { SDValue WideVec = ModifyToType(Vec, WideVecVT); SDValue WideMask = ModifyToType(Mask, WideMaskVT); - return DAG.getNode(ISD::MCOMPRESS, SDLoc(N), WideVecVT, WideVec, WideMask); + return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), WideVecVT, WideVec, WideMask); } SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 775cf64a82a0da..d13c4c878e21f3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6279,10 +6279,10 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, // We can't create a scalar CONCAT_VECTORS so skip it. It will break // for concats involving SPLAT_VECTOR. Concats of BUILD_VECTORS are handled by // foldCONCAT_VECTORS in getNode before this is called. - // MCOMPRESS is not defined for scalars. We handle constants before this call. + // MASKED_COMPRESS is not defined for scalars. We handle constants before this call. // TODO: Make this aware of vector-only opcodes and skip all of them. 
if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS || - Opcode == ISD::MCOMPRESS) + Opcode == ISD::MASKED_COMPRESS) return SDValue(); unsigned NumOps = Ops.size(); @@ -7200,7 +7200,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return N1.getOperand(1); break; } - case ISD::MCOMPRESS: { + case ISD::MASKED_COMPRESS: { EVT VecVT = N1.getValueType(); [[maybe_unused]] EVT MaskVT = N2.getValueType(); assert(VT == VecVT && "Vector and result type don't match."); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 20461511ac92ff..e413223d812814 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6719,7 +6719,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitMaskedStore(I, true /* IsCompressing */); return; case Intrinsic::masked_compress: - setValue(&I, DAG.getNode(ISD::MCOMPRESS, sdl, + setValue(&I, DAG.getNode(ISD::MASKED_COMPRESS, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), Flags)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 37288054b0e7b0..523d1999864840 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -416,7 +416,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::MSTORE: return "masked_store"; case ISD::MGATHER: return "masked_gather"; case ISD::MSCATTER: return "masked_scatter"; - case ISD::MCOMPRESS: return "masked_compress"; + case ISD::MASKED_COMPRESS: return "masked_compress"; case ISD::VAARG: return "vaarg"; case ISD::VACOPY: return "vacopy"; case ISD::VAEND: return "vaend"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index bc8c2ed942609f..19e4042624d724 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11226,7 +11226,7 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, MachinePointerInfo::getUnknownStack(MF)); } -SDValue TargetLowering::expandMCOMPRESS(SDNode *Node, SelectionDAG &DAG) const { +SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, SelectionDAG &DAG) const { SDLoc DL(Node); SDValue Vec = Node->getOperand(0); SDValue Mask = Node->getOperand(1); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 5ee12be752b277..01457437f66bac 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -957,7 +957,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::VECTOR_SPLICE, VT, Expand); // Only some target support this vector operation. Most need to expand it. - setOperationAction(ISD::MCOMPRESS, VT, Expand); + setOperationAction(ISD::MASKED_COMPRESS, VT, Expand); // VP operations default to expand. #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) 
\ diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index cf4390edfca3b0..7938db09a8722e 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1133,7 +1133,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .lower(); // TODO: Update this to correct handling when adding AArch64/SVE support. - getActionDefinitionsBuilder(G_MCOMPRESS).lower(); + getActionDefinitionsBuilder(G_MASKED_COMPRESS).lower(); getActionDefinitionsBuilder({G_FSHL, G_FSHR}) .customFor({{s32, s32}, {s32, s64}, {s64, s64}}) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir index 2d28b4a597ed36..62a5927efd197d 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir @@ -1,12 +1,12 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s --- -name: test_mcompress_v4s32 +name: test_masked_compress_v4s32 body: | bb.0: liveins: $q0, $d1 - ; CHECK-LABEL: name: test_mcompress_v4s32 + ; CHECK-LABEL: name: test_masked_compress_v4s32 ; CHECK: liveins: $q0, $d1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 @@ -59,7 +59,7 @@ body: | ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s16>) = COPY $d1 - %2:_(<4 x s32>) = G_MCOMPRESS %0(<4 x s32>), %1(<4 x s16>) + %2:_(<4 x s32>) = G_MASKED_COMPRESS %0(<4 x s32>), %1(<4 x s16>) $q0 = COPY %2(<4 x s32>) RET_ReallyLR implicit $q0 ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index a732c4d6ed5f37..1c79ad375d7cd5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -642,7 +642,7 @@ # DEBUG-NEXT: G_SPLAT_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: G_MCOMPRESS (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: G_MASKED_COMPRESS (opcode {{[0-9]+}}): 3 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices From 4366a43cd1edc142e1a6e182ac8b086621f035c6 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 7 Jun 2024 09:16:13 +0200 Subject: [PATCH 26/49] Get stack alignment for load in GlobalISel --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index b646454c3d345e..796f4411f14f77 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -7538,8 +7538,9 @@ LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { } } - Align VecAlign = getStackTemporaryAlignment(VecTy); - MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, VecAlign); + MachineFrameInfo &MFI = MI.getMF()->getFrameInfo(); + MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, + MFI.getObjectAlign(PtrInfo.StackID)); MI.eraseFromParent(); return Legalized; From 808709f3aa5dc6a064c819141adb440a9bd34d2a Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 7 Jun 2024 09:17:26 +0200 Subject: [PATCH 27/49] Fix formatting --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 3 ++- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 5 +++-- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 ++- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 50f729f90fea7a..b371cd4fe8a6ab 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2347,7 +2347,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, N->getIndexType(), TruncateStore); } -SDValue DAGTypeLegalizer::PromoteIntOp_MASKED_COMPRESS(SDNode *N, unsigned OpNo) { +SDValue DAGTypeLegalizer::PromoteIntOp_MASKED_COMPRESS(SDNode *N, + unsigned OpNo) { assert(OpNo == 1 && "Can only promote MASKED_COMPRESS mask."); SDValue Vec = N->getOperand(0); EVT VT = Vec.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 034d52fe3119f1..2f25f70680a2a9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2308,7 +2308,7 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, } void DAGTypeLegalizer::SplitVecRes_MASKED_COMPRESS(SDNode *N, SDValue &Lo, - SDValue &Hi) { + SDValue &Hi) { // This is not "trivial", as there is a dependency between the two subvectors. // Depending on the number of 1s in the mask, the elements from the Hi vector // need to be moved to the Lo vector. 
So we just perform this as one "big" @@ -5623,7 +5623,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_MASKED_COMPRESS(SDNode *N) { SDValue WideVec = ModifyToType(Vec, WideVecVT); SDValue WideMask = ModifyToType(Mask, WideMaskVT); - return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), WideVecVT, WideVec, WideMask); + return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), WideVecVT, WideVec, + WideMask); } SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d13c4c878e21f3..e434b0cde96ce7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6279,7 +6279,8 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, // We can't create a scalar CONCAT_VECTORS so skip it. It will break // for concats involving SPLAT_VECTOR. Concats of BUILD_VECTORS are handled by // foldCONCAT_VECTORS in getNode before this is called. - // MASKED_COMPRESS is not defined for scalars. We handle constants before this call. + // MASKED_COMPRESS is not defined for scalars. We handle constants before this + // call. // TODO: Make this aware of vector-only opcodes and skip all of them. if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS || Opcode == ISD::MASKED_COMPRESS) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 19e4042624d724..56c1f55c24b965 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11226,7 +11226,8 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, MachinePointerInfo::getUnknownStack(MF)); } -SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, SelectionDAG &DAG) const { +SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, + SelectionDAG &DAG) const { SDLoc DL(Node); SDValue Vec = Node->getOperand(0); SDValue Mask = Node->getOperand(1); From d5fca0f83971c8a3b9aa4a34cb8301cd4e0bf363 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 7 Jun 2024 09:22:22 +0200 Subject: [PATCH 28/49] Replace use of TLI inside of TargetLowering --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 56c1f55c24b965..b0cc3490b27229 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11245,13 +11245,12 @@ SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, SDValue Chain = DAG.getEntryNode(); SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned NumElms = VecVT.getVectorNumElements(); for (unsigned I = 0; I < NumElms; I++) { SDValue Idx = DAG.getVectorIdxConstant(I, DL); SDValue ValI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, Idx); - SDValue OutPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); + SDValue OutPtr = getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); Chain = DAG.getStore( Chain, DL, ValI, OutPtr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); From 00b64d2c9121585b691201ecb4da3276d120957d Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 7 Jun 2024 11:40:36 +0200 Subject: [PATCH 29/49] Use vector alignement for stack load --- 
llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 796f4411f14f77..75db1d85bd04cd 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -7508,23 +7508,25 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy] = MI.getFirst3RegLLTs(); + Align VecAlign = getStackTemporaryAlignment(VecTy); MachinePointerInfo PtrInfo; Register StackPtr = - createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), - getStackTemporaryAlignment(VecTy), PtrInfo) + createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, + PtrInfo) .getReg(0); LLT IdxTy = LLT::scalar(32); LLT ValTy = VecTy.getElementType(); Align ValAlign = getStackTemporaryAlignment(ValTy); - Register OutPos = MIRBuilder.buildConstant(IdxTy, 0).getReg(0); + auto OutPos = MIRBuilder.buildConstant(IdxTy, 0); unsigned NumElmts = VecTy.getNumElements(); for (unsigned I = 0; I < NumElmts; ++I) { auto Idx = MIRBuilder.buildConstant(IdxTy, I); auto Val = MIRBuilder.buildExtractVectorElement(ValTy, Vec, Idx); - Register ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos); + Register ElmtPtr = + getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0)); MIRBuilder.buildStore(Val, ElmtPtr, PtrInfo, ValAlign); if (I < NumElmts - 1) { @@ -7534,13 +7536,12 @@ LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI); MaskI = MIRBuilder.buildZExt(IdxTy, MaskI); - OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI).getReg(0); + OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI); } } - MachineFrameInfo &MFI = MI.getMF()->getFrameInfo(); - MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, - MFI.getObjectAlign(PtrInfo.StackID)); + // TODO: Use StackPtr's FrameIndex alignment. + MIRBuilder.buildLoad(Dst, StackPtr, PtrInfo, VecAlign); MI.eraseFromParent(); return Legalized; From 60f9c616823745737ea3eb1aac1a2db8629ce570 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 7 Jun 2024 11:47:43 +0200 Subject: [PATCH 30/49] Add llvm.masked.compress to release notes --- llvm/docs/ReleaseNotes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index f2577e1684f5f3..aa09846552d3f4 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -55,6 +55,7 @@ Changes to the LLVM IR * Renamed ``llvm.experimental.vector.splice`` intrinsic to ``llvm.vector.splice``. * Renamed ``llvm.experimental.vector.interleave2`` intrinsic to ``llvm.vector.interleave2``. * Renamed ``llvm.experimental.vector.deinterleave2`` intrinsic to ``llvm.vector.deinterleave2``. +* Added ``llvm.masked.compress`` intrinsic. 
Changes to LLVM infrastructure ------------------------------ From f7e4b480ad645992ea51484ed5f44c55dee5bd66 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 17 Jun 2024 10:55:20 +0200 Subject: [PATCH 31/49] Add passthru vector to @llvm.masked.compress --- llvm/docs/GlobalISel/GenericOpcode.rst | 7 ++ llvm/include/llvm/CodeGen/ISDOpcodes.h | 4 +- llvm/include/llvm/IR/Intrinsics.td | 2 +- llvm/include/llvm/Target/GenericOpcodes.td | 2 +- .../include/llvm/Target/TargetSelectionDAG.td | 6 +- .../SelectionDAG/LegalizeIntegerTypes.cpp | 6 +- .../SelectionDAG/LegalizeVectorTypes.cpp | 4 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 36 ++++----- .../SelectionDAG/SelectionDAGBuilder.cpp | 3 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 78 +++++++++++++++---- 10 files changed, 107 insertions(+), 41 deletions(-) diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index 5c28c6fcd30fb6..e44d2bf37aff70 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -710,6 +710,13 @@ The type of the operand must be equal to or larger than the vector element type. If the operand is larger than the vector element type, the scalar is implicitly truncated to the vector element type. +G_MASKED_COMPRESS +^^^^^^^^^^^^^^^^^ + +Given an input vector, a mask vector, and a passthru vector, continuously place +all selected (i.e., where mask[i] = true) input lanes in an output vector. All +remaining lanes in the output are taken from passthru, which may be undef. + Vector Reduction Operations --------------------------- diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index f834ede597d77d..c1a921455cd346 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1296,10 +1296,12 @@ enum NodeType { MLOAD, MSTORE, - // MASKED_COMPRESS(Vec, Mask) + // MASKED_COMPRESS(Vec, Mask, Passthru) // consecutively place vector elements based on mask // e.g., vec = {A, B, C, D} and mask = {1, 0, 1, 0} // --> {A, C, ?, ?} where ? is undefined + // If passthru is defined, ?s are replaced with elements at corresponding + // positions from passthru. If passthru is undef, ?s remain undefined. MASKED_COMPRESS, // Masked gather and scatter - load and store operations for a vector of diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index b2a2a591d05a44..014fe8051f1e11 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2364,7 +2364,7 @@ def int_masked_compressstore: def int_masked_compress: DefaultAttrsIntrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrNoMem, IntrWillReturn]>; // Test whether a pointer is associated with a type metadata identifier. diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 29e82bf6398428..7a7da7c69e6926 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1503,7 +1503,7 @@ def G_SPLAT_VECTOR: GenericInstruction { // Generic masked compress. 
def G_MASKED_COMPRESS: GenericInstruction { let OutOperandList = (outs type0:$dst); - let InOperandList = (ins type1:$vec, type2:$mask); + let InOperandList = (ins type0:$vec, type1:$mask, type0:$passthru); let hasSideEffects = false; } diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 4b77a578126ec0..c4ab43b53995a7 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -266,8 +266,10 @@ def SDTMaskedScatter : SDTypeProfile<0, 4, [ SDTCisSameNumEltsAs<0, 1>, SDTCisSameNumEltsAs<0, 3> ]>; -def SDTMaskedCompress : SDTypeProfile<1, 2, [ - SDTCisVec<0>, SDTCisVec<1>, SDTCisSameNumEltsAs<0, 1>, +def SDTMaskedCompress : SDTypeProfile<1, 3, [ + SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVec<2>, SDTCisSameNumEltsAs<1, 2>, + SDTCisSameAs<1, 3> ]>; def SDTVecShuffle : SDTypeProfile<1, 2, [ diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 50899637967efd..4bebb64f4ec34c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -968,8 +968,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_MASKED_COMPRESS(SDNode *N) { SDValue Vec = GetPromotedInteger(N->getOperand(0)); + SDValue Passthru = GetPromotedInteger(N->getOperand(2)); return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), Vec.getValueType(), Vec, - N->getOperand(1)); + N->getOperand(1), Passthru); } /// Promote the overflow flag of an overflowing arithmetic node. @@ -2401,8 +2402,9 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MASKED_COMPRESS(SDNode *N, assert(OpNo == 1 && "Can only promote MASKED_COMPRESS mask."); SDValue Vec = N->getOperand(0); EVT VT = Vec.getValueType(); + SDValue Passthru = N->getOperand(2); SDValue Mask = PromoteTargetBoolean(N->getOperand(1), VT); - return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), VT, Vec, Mask); + return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), VT, Vec, Mask, Passthru); } SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 48ba0142160ee3..d090cdd0830ee9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -5627,6 +5627,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) { SDValue DAGTypeLegalizer::WidenVecRes_MASKED_COMPRESS(SDNode *N) { SDValue Vec = N->getOperand(0); SDValue Mask = N->getOperand(1); + SDValue Passthru = N->getOperand(2); EVT WideVecVT = TLI.getTypeToTransformTo(*DAG.getContext(), Vec.getValueType()); EVT WideMaskVT = EVT::getVectorVT(*DAG.getContext(), @@ -5635,8 +5636,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_MASKED_COMPRESS(SDNode *N) { SDValue WideVec = ModifyToType(Vec, WideVecVT); SDValue WideMask = ModifyToType(Mask, WideMaskVT); + SDValue WidePassthru = ModifyToType(Passthru, WideVecVT); return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), WideVecVT, WideVec, - WideMask); + WideMask, WidePassthru); } SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 6ae537de9abb05..ca5b8e09cd9fac 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6340,11 +6340,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, // We can't create a scalar CONCAT_VECTORS so skip it. It will break // for concats involving SPLAT_VECTOR. Concats of BUILD_VECTORS are handled by // foldCONCAT_VECTORS in getNode before this is called. - // MASKED_COMPRESS is not defined for scalars. We handle constants before this - // call. - // TODO: Make this aware of vector-only opcodes and skip all of them. - if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS || - Opcode == ISD::MASKED_COMPRESS) + if (Opcode >= ISD::BUILTIN_OP_END || Opcode == ISD::CONCAT_VECTORS) return SDValue(); unsigned NumOps = Ops.size(); @@ -7277,20 +7273,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return N1.getOperand(1); break; } - case ISD::MASKED_COMPRESS: { - EVT VecVT = N1.getValueType(); - [[maybe_unused]] EVT MaskVT = N2.getValueType(); - assert(VT == VecVT && "Vector and result type don't match."); - assert(VecVT.isVector() && MaskVT.isVector() && - "Both inputs must be vectors."); - assert(VecVT.getVectorElementCount() == MaskVT.getVectorElementCount() && - "Vector and mask must have same number of elements."); - - if (N1.isUndef() || N2.isUndef()) - return getUNDEF(VecVT); - - break; - } } // Perform trivial constant folding. @@ -7525,6 +7507,22 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (N1.getValueType() == VT) return N1; break; + case ISD::MASKED_COMPRESS: { + EVT VecVT = N1.getValueType(); + [[maybe_unused]] EVT MaskVT = N2.getValueType(); + [[maybe_unused]] EVT PassthruVT = N3.getValueType(); + assert(VT == VecVT && "Vector and result type don't match."); + assert(VecVT.isVector() && MaskVT.isVector() && PassthruVT.isVector() && + "All inputs must be vectors."); + assert(VecVT == PassthruVT && "Vector and passthru types don't match."); + assert(VecVT.getVectorElementCount() == MaskVT.getVectorElementCount() && + "Vector and mask must have same number of elements."); + + if (N1.isUndef() || N2.isUndef()) + return getUNDEF(VecVT); + + break; + } } // Memoize node if it doesn't produce a glue result. 
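Note for readers: the TargetLowering.cpp hunks below implement the passthru handling in expandMASKED_COMPRESS. The strategy is easier to follow as a scalar model first. The following plain C++ sketch is illustrative only; the array-based compressModel helper and its names are invented for this note and are not part of the patch.

#include <array>
#include <cstddef>

// Scalar stand-in for expandMASKED_COMPRESS with a passthru vector.
template <typename T, std::size_t N>
std::array<T, N> compressModel(const std::array<T, N> &Vec,
                               const std::array<bool, N> &Mask,
                               const std::array<T, N> &Passthru) {
  // 1. Start from the passthru contents (the expansion first stores Passthru
  //    into the stack temporary).
  std::array<T, N> Out = Passthru;

  // 2. Remember the passthru element at position popcount(Mask); the
  //    unconditional stores below may clobber exactly that slot.
  std::size_t Popcount = 0;
  for (bool B : Mask)
    Popcount += B ? 1 : 0;
  T LastWriteVal = Passthru[Popcount < N ? Popcount : N - 1];

  // 3. Store every element unconditionally, but only advance the output
  //    position for selected lanes. This keeps the loop branchless.
  std::size_t OutPos = 0;
  for (std::size_t I = 0; I < N; ++I) {
    Out[OutPos] = Vec[I];
    OutPos += Mask[I] ? 1 : 0;
  }

  // 4. Fix up the one slot that may have been clobbered. If every lane was
  //    selected, the last element written is already the right value.
  if (OutPos < N)
    Out[OutPos] = LastWriteVal;
  return Out;
}

The actual expansion performs the same steps with stores into a stack temporary followed by a single vector reload, and it skips the popcount load entirely when the passthru is a constant splat.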
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 825b2ed2da41a4..e732cdf9b23d95 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6735,7 +6735,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, setValue(&I, DAG.getNode(ISD::MASKED_COMPRESS, sdl, getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)), Flags)); + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), Flags)); return; case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index b7adf467445841..f262637caa9d45 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11320,10 +11320,12 @@ SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, SDLoc DL(Node); SDValue Vec = Node->getOperand(0); SDValue Mask = Node->getOperand(1); + SDValue Passthru = Node->getOperand(2); EVT VecVT = Vec.getValueType(); EVT ScalarVT = VecVT.getScalarType(); - EVT MaskScalarVT = Mask.getValueType().getScalarType(); + EVT MaskVT = Mask.getValueType(); + EVT MaskScalarVT = MaskVT.getScalarType(); // Needs to be handled by targets that have scalable vector types. if (VecVT.isScalableVector()) @@ -11331,9 +11333,48 @@ SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, SDValue StackPtr = DAG.CreateStackTemporary( VecVT.getStoreSize(), DAG.getReducedAlign(VecVT, /*UseABI=*/false)); + int FI = cast(StackPtr.getNode())->getIndex(); + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + SDValue Chain = DAG.getEntryNode(); SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); + bool HasPassthru = !Passthru.isUndef(); + + // If we have a passthru vector, store it on the stack, overwrite the matching + // positions and then re-write the last element that was potentially + // overwritten even though mask[i] = false. + if (HasPassthru) + Chain = DAG.getStore(Chain, DL, Passthru, StackPtr, PtrInfo); + + SDValue LastWriteVal; + APInt PassthruSplatVal; + bool IsSplatPassthru = + ISD::isConstantSplatVector(Passthru.getNode(), PassthruSplatVal); + + if (HasPassthru && IsSplatPassthru) { + // As we do not know which position we wrote to last, we cannot simply + // access that index from the passthru vector. So we first check if passthru + // is a splat vector, to use any element ... + LastWriteVal = DAG.getConstant(PassthruSplatVal, DL, ScalarVT); + } else { + // ... if it is not a splat vector, we need to get the passthru value at + // position = popcount(mask) and re-load it from the stack before it is + // overwritten in the loop below. 
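+    // The mask's lanes may have been promoted to a wider integer type by this
+    // point, so truncate them back to i1, zero-extend each lane to i32, and
+    // sum the lanes with VECREDUCE_ADD to get popcount(mask).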
+ SDValue Popcount = DAG.getNode( + ISD::TRUNCATE, DL, MaskVT.changeVectorElementType(MVT::i1), Mask); + Popcount = DAG.getNode(ISD::ZERO_EXTEND, DL, + MaskVT.changeVectorElementType(MVT::i32), Popcount); + Popcount = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, Popcount); + SDValue LastElmtPtr = + getVectorElementPointer(DAG, StackPtr, VecVT, Popcount); + LastWriteVal = DAG.getLoad( + ScalarVT, DL, Chain, LastElmtPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); + Chain = LastWriteVal.getValue(1); + } + unsigned NumElms = VecVT.getVectorNumElements(); for (unsigned I = 0; I < NumElms; I++) { SDValue Idx = DAG.getVectorIdxConstant(I, DL); @@ -11344,21 +11385,32 @@ SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, Chain, DL, ValI, OutPtr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); - // Skip this for last element. - if (I < NumElms - 1) { - // Get the mask value and add it to the current output position. This - // either increments by 1 if MaskI is true or adds 0 otherwise. - SDValue MaskI = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); - MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); - MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI); - OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI); + // Get the mask value and add it to the current output position. This + // either increments by 1 if MaskI is true or adds 0 otherwise. + SDValue MaskI = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); + MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); + MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI); + OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI); + + if (HasPassthru && I == NumElms - 1) { + SDValue EndOfVector = + DAG.getConstant(VecVT.getVectorNumElements() - 1, DL, MVT::i32); + SDValue AllLanesSelected = + DAG.getSetCC(DL, MVT::i1, OutPos, EndOfVector, ISD::CondCode::SETUGT); + OutPos = DAG.getNode(ISD::UMIN, DL, MVT::i32, OutPos, EndOfVector); + OutPtr = getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); + + // Re-write the last ValI if all lanes were selected. Otherwise, + // overwrite the last write it with the passthru value. 
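+      // OutPos was clamped to the last valid lane with UMIN above, so this
+      // final store stays within the stack temporary even when every lane is
+      // selected.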
+ LastWriteVal = + DAG.getSelect(DL, ScalarVT, AllLanesSelected, ValI, LastWriteVal); + Chain = DAG.getStore( + Chain, DL, LastWriteVal, OutPtr, + MachinePointerInfo::getUnknownStack(DAG.getMachineFunction())); } } - int FI = cast(StackPtr.getNode())->getIndex(); - MachinePointerInfo PtrInfo = - MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); return DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo); } From 9f291c8ce97ed9b432c8b74c0ea4a1f15348c94d Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 17 Jun 2024 12:24:35 +0200 Subject: [PATCH 32/49] Add passthru to DAGCombiner --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +- llvm/test/CodeGen/AArch64/masked-compress.ll | 121 +++++++++++++++--- 2 files changed, 112 insertions(+), 20 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index e2857ed63cab59..8a32b068acb638 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12097,11 +12097,14 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { SDLoc DL(N); SDValue Vec = N->getOperand(0); SDValue Mask = N->getOperand(1); + SDValue Passthru = N->getOperand(2); EVT VecVT = Vec.getValueType(); + bool HasPassthru = !Passthru.isUndef(); + APInt SplatVal; if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) - return SplatVal.isAllOnes() ? Vec : DAG.getUNDEF(VecVT); + return SplatVal.isAllOnes() ? Vec : (HasPassthru ? Passthru : DAG.getUNDEF(VecVT)); if (Vec.isUndef() || Mask.isUndef()) return DAG.getUNDEF(VecVT); @@ -12126,7 +12129,11 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { } } for (unsigned Rest = NumSelected; Rest < NumElmts; ++Rest) { - Ops.push_back(DAG.getUNDEF(ScalarVT)); + SDValue Val = HasPassthru ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Passthru, + DAG.getVectorIdxConstant(Rest - NumSelected, DL)) + : DAG.getUNDEF(ScalarVT); + Ops.push_back(Val); + } return DAG.getBuildVector(VecVT, DL, Ops); } diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index c5382f591b87e0..926858ec1394cb 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -28,7 +28,57 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) { ; CHECK-NEXT: st1.s { v0 }[3], [x11] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress.v4i32(<4 x i32> %vec, <4 x i1> %mask) + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef) + ret <4 x i32> %out +} + + +define <4 x i32> @test_compress_v4i32_with_passthru(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) { +; CHECK-LABEL: test_compress_v4i32_with_passthru: +; CHECK: ; %bb.0: +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: shl.4s v1, v1, #31 +; CHECK-NEXT: cmlt.4s v1, v1, #0 +; CHECK-NEXT: and.16b v3, v1, v3 +; CHECK-NEXT: str q2, [sp, #-16]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: mov.s w8, v1[1] +; CHECK-NEXT: fmov w16, s1 +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: mov.s w11, v1[2] +; CHECK-NEXT: addv.4s s2, v3 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mov.s w13, v1[3] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: bfi x12, x16, #2, #1 +; CHECK-NEXT: and w16, w16, #0x1 +; CHECK-NEXT: mov w15, #3 ; =0x3 +; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: add w8, w16, w8 +; CHECK-NEXT: fmov w16, s2 +; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: add w11, w8, w11 +; CHECK-NEXT: orr x8, x9, x8, lsl #2 +; CHECK-NEXT: add w13, w11, w13 +; CHECK-NEXT: bfi x14, x11, #2, #2 +; CHECK-NEXT: bfi x10, x16, #2, #2 +; CHECK-NEXT: mov.s w16, v0[3] +; CHECK-NEXT: cmp w13, #3 +; CHECK-NEXT: csel w11, w13, w15, lo +; CHECK-NEXT: ldr w10, [x10] +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: st1.s { v0 }[1], [x12] +; CHECK-NEXT: st1.s { v0 }[2], [x8] +; CHECK-NEXT: orr x8, x9, x11, lsl #2 +; CHECK-NEXT: csel w9, w16, w10, hi +; CHECK-NEXT: st1.s { v0 }[3], [x14] +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) ret <4 x i32> %out } @@ -47,7 +97,7 @@ define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask) { ; CHECK-NEXT: st1.d { v0 }[1], [x8] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <2 x double> @llvm.masked.compress.v2f64(<2 x double> %vec, <2 x i1> %mask) + %out = call <2 x double> @llvm.masked.compress.v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> undef) ret <2 x double> %out } @@ -147,7 +197,7 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { ; CHECK-NEXT: st1.b { v0 }[15], [x8] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <16 x i8> @llvm.masked.compress.v16i8(<16 x i8> %vec, <16 x i1> %mask) + %out = call <16 x i8> @llvm.masked.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef) ret <16 x i8> %out } @@ -199,23 +249,21 @@ define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { ; CHECK-NEXT: st1.s { v1 }[3], [x9] ; CHECK-NEXT: ldp q0, q1, [sp], #32 ; CHECK-NEXT: ret - %out = call <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> %vec, <8 x i1> %mask) + %out = call <8 x i32> @llvm.masked.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> undef) ret <8 x i32> %out } define <4 x i32> @test_compress_all_const() { -; CHECK: lCPI4_0: -; CHECK-NEXT: .long 5 -; CHECK-NEXT: .long 9 ; CHECK-LABEL: test_compress_all_const: ; CHECK: ; %bb.0: ; CHECK-NEXT: Lloh0: -; CHECK-NEXT: adrp x8, lCPI4_0@PAGE +; CHECK-NEXT: adrp x8, lCPI5_0@PAGE ; CHECK-NEXT: Lloh1: -; CHECK-NEXT: ldr q0, [x8, lCPI4_0@PAGEOFF] +; CHECK-NEXT: ldr q0, [x8, lCPI5_0@PAGEOFF] ; CHECK-NEXT: ret %out = call <4 x i32> @llvm.masked.compress(<4 x i32> , - <4 x i1> ) + <4 x i1> , + <4 x i32> undef) ret <4 x i32> %out } @@ -224,7 +272,29 @@ define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) { ; CHECK: ; %bb.0: ; CHECK-NEXT: mov.s v0[1], v0[3] ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> ) + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %out +} + +define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) { +; CHECK-LABEL: test_compress_const_mask_passthrough: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.s v0[1], v0[3] +; CHECK-NEXT: ret + %out = call <4 x i32> 
@llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> %passthru) + ret <4 x i32> %out +} + +define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_mask_const_passthrough: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.s v0[1], v0[3] +; CHECK-NEXT: mov w8, #5 ; =0x5 +; CHECK-NEXT: mov.s v0[2], w8 +; CHECK-NEXT: mov w8, #6 ; =0x6 +; CHECK-NEXT: mov.s v0[3], w8 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> ) ret <4 x i32> %out } @@ -235,21 +305,36 @@ define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> % ; CHECK: ; %bb.0: ; CHECK-NEXT: mov.16b v0, v1 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1)) + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef) ret <4 x i32> %out } define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) { ; CHECK-LABEL: test_compress_const_splat0_mask: ; CHECK: ; %bb.0: ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 0)) + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) ret <4 x i32> %out } define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) { ; CHECK-LABEL: test_compress_undef_mask: ; CHECK: ; %bb.0: ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> undef) + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef) + ret <4 x i32> %out +} +define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) { +; CHECK-LABEL: test_compress_const_splat0_mask_with_passthru: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.16b v0, v2 +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru) + ret <4 x i32> %out +} +define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) { +; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru: +; CHECK: ; %bb.0: +; CHECK-NEXT: ret + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) ret <4 x i32> %out } @@ -281,7 +366,7 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %out = call <4 x i8> @llvm.masked.compress.v4i8(<4 x i8> %vec, <4 x i1> %mask) + %out = call <4 x i8> @llvm.masked.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef) ret <4 x i8> %out } @@ -313,7 +398,7 @@ define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mas ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %out = call <4 x i4> @llvm.masked.compress.v4i4(<4 x i4> %vec, <4 x i1> %mask) + %out = call <4 x i4> @llvm.masked.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef) ret <4 x i4> %out } @@ -347,7 +432,7 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { ; CHECK-NEXT: st1.s { v0 }[3], [x10] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <3 x i32> @llvm.masked.compress.v3i32(<3 x i32> %vec, <3 x i1> %mask) + %out = call <3 x i32> @llvm.masked.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef) ret <3 x i32> %out } @@ -379,6 +464,6 @@ define <3 x i3> 
@test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i1> %mask) {
; CHECK-NEXT:    umov.h w2, v0[2]
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    ret
-  %out = call <3 x i3> @llvm.masked.compress.v3i3(<3 x i3> %vec, <3 x i1> %mask)
+  %out = call <3 x i3> @llvm.masked.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef)
   ret <3 x i3> %out
 }

From c733d6b35f12b10b3e4c61c472a794ad0815cf66 Mon Sep 17 00:00:00 2001
From: Lawrence Benson
Date: Mon, 17 Jun 2024 12:34:45 +0200
Subject: [PATCH 33/49] Update LangRef with passthru

---
 llvm/docs/LangRef.rst | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index b1fafc5efee541..4ff62058461d95 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19244,11 +19244,12 @@ Syntax:
 """""""
 This is an overloaded intrinsic. A number of scalar values of integer, floating point or pointer data type are collected
 from an input vector and placed adjacently within the result vector. A mask defines which elements to collect from the vector.
+The remaining lanes are filled with values from ``passthru``.

 .. code-block:: llvm

-      declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> , <8 x i1> )
-      declare <16 x float> @llvm.masked.compress.v16f32(<16 x float> , <16 x i1> )
+      declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> , <8 x i1> , <8 x i32> )
+      declare <16 x float> @llvm.masked.compress.v16f32(<16 x float> , <16 x i1> , <16 x float> undef)

 Overview:
 """""""""

@@ -19256,9 +19257,9 @@ Overview:
 Selects elements from input vector '``value``' according to the '``mask``'.
 All selected elements are written into adjacent lanes in the result vector, from lower to higher.
 The mask holds an entry for each vector lane, and is used to select elements to be kept.
-The number of valid lanes is equal to the number of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values are undefined.
-The main difference to :ref:`llvm.masked.compressstore ` is that the remainder of the vector may
-contain undefined values.
+If ``passthru`` is undefined, the number of valid lanes is equal to the number of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values are undefined.
+If a ``passthru`` vector is given, all remaining lanes are filled with values from ``passthru``, starting from ``passthru[0]``.
+The main difference to :ref:`llvm.masked.compressstore ` is that we do not need to guard against memory access for unselected lanes.
 This allows for branchless code and better optimization for all targets that do not support or have inefficient instructions of the explicit semantics of
 :ref:`llvm.masked.compressstore ` but still have some form of compress operations.
 The result vector can be written with a similar effect, as all the selected values are at the lower positions of the
@@ -19269,28 +19270,34 @@ Arguments:
 """"""""""

 The first operand is the input vector, from which elements are selected.
 The second operand is the mask, a vector of boolean values.
+The third operand is the passthru vector, from which elements are taken to fill the remaining lanes.
 The mask and the input vector must have the same number of vector elements.
+The input and passthru vectors must have the same type.

 Semantics:
 """"""""""

 The ``llvm.masked.compress`` intrinsic compresses data within a vector.
-It collects elements from possibly non-adjacent lanes of a vector and place them contiguously in the result vector based
-on a selection mask.
+It collects elements from possibly non-adjacent lanes of a vector and places them contiguously in the result vector based
+on a selection mask, filling the remaining lanes with values from ``passthru``.
 This intrinsic performs the logic of the following C++ example.
-All values in ``out`` after the last selected one are undefined.
-If all entries in the ``mask`` are 0, the entire ``out`` vector is undefined.
+All values in ``out`` after the last selected one are undefined if ``passthru`` is undefined.
+If all entries in the ``mask`` are 0, the ``out`` vector is ``passthru``.

 .. code-block:: cpp

     // Consecutively place selected values in a vector.
     using VecT __attribute__((vector_size(N))) = int;
-    VecT compress(VecT vec, VecT mask) {
+    VecT compress(VecT vec, VecT mask, VecT passthru) {
       VecT out;
       int idx = 0;
       for (int i = 0; i < N / sizeof(int); ++i) {
         out[idx] = vec[i];
         idx += static_cast<bool>(mask[i]);
       }
+      for (int i = idx; i < N / sizeof(int); ++i) {
+        out[i] = passthru[i - idx];
+      }
       return out;
     }

From 9222b54c00f6672129508eadc4a36856a6a90139 Mon Sep 17 00:00:00 2001
From: Lawrence Benson
Date: Mon, 17 Jun 2024 13:47:58 +0200
Subject: [PATCH 34/49] Add passthru to GlobalISel

---
 .../CodeGen/GlobalISel/LegalizerHelper.cpp    |  51 +++++++--
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |   4 +-
 .../GlobalISel/legalize-masked-compress.mir   | 101 ++++++++++++++++--
 llvm/test/CodeGen/AArch64/masked-compress.ll  |   4 +-
 4 files changed, 142 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index e372d5e2fcc502..76be3fe18a1e3f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7514,7 +7514,10 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {

 LegalizerHelper::LegalizeResult
 LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) {
-  auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy] = MI.getFirst3RegLLTs();
+  auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] =
+      MI.getFirst4RegLLTs();
+
+  if (VecTy.isScalableVector())
+    report_fatal_error("Cannot expand masked_compress for scalable vectors.");

   Align VecAlign = getStackTemporaryAlignment(VecTy);
   MachinePointerInfo PtrInfo;
@@ -7522,6 +7525,7 @@ LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) {
       createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign,
                            PtrInfo)
           .getReg(0);
+  MachinePointerInfo ValPtrInfo = MachinePointerInfo::getUnknownStack(*MI.getMF());

   LLT IdxTy = LLT::scalar(32);
   LLT ValTy = VecTy.getElementType();
@@ -7529,22 +7533,51 @@ LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) {

   auto OutPos = MIRBuilder.buildConstant(IdxTy, 0);

+  bool HasPassthru = MRI.getVRegDef(Passthru)->getOpcode() !=
+                     TargetOpcode::G_IMPLICIT_DEF; // isGuaranteedNotToBeUndef(Passthru, MRI);
+
+  if (HasPassthru)
+    MIRBuilder.buildStore(Passthru, StackPtr, PtrInfo, VecAlign);
+
+  Register LastWriteVal;
+  std::optional<APInt> PassthruSplatVal =
+      isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI);
+
+  if (PassthruSplatVal.has_value()) {
+    LastWriteVal =
+        MIRBuilder.buildConstant(ValTy, PassthruSplatVal.value()).getReg(0);
+  } else if (HasPassthru) {
+    auto Popcount = MIRBuilder.buildZExt(MaskTy.changeElementSize(32), Mask);
+    Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD,
+                                     {LLT::scalar(32)}, {Popcount});
+
+    Register LastElmtPtr =
+        getVectorElementPointer(StackPtr, VecTy, Popcount.getReg(0));
+    LastWriteVal =
MIRBuilder.buildLoad(ValTy, LastElmtPtr, ValPtrInfo, ValAlign).getReg(0); + } + unsigned NumElmts = VecTy.getNumElements(); for (unsigned I = 0; I < NumElmts; ++I) { auto Idx = MIRBuilder.buildConstant(IdxTy, I); auto Val = MIRBuilder.buildExtractVectorElement(ValTy, Vec, Idx); Register ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0)); - MIRBuilder.buildStore(Val, ElmtPtr, PtrInfo, ValAlign); + MIRBuilder.buildStore(Val, ElmtPtr, ValPtrInfo, ValAlign); + + LLT MaskITy = MaskTy.getElementType(); + auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx); + if (MaskITy.getSizeInBits() > 1) + MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI); + + MaskI = MIRBuilder.buildZExt(IdxTy, MaskI); + OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI); - if (I < NumElmts - 1) { - LLT MaskITy = MaskTy.getElementType(); - auto MaskI = MIRBuilder.buildExtractVectorElement(MaskITy, Mask, Idx); - if (MaskITy.getSizeInBits() > 1) - MaskI = MIRBuilder.buildTrunc(LLT::scalar(1), MaskI); + if (HasPassthru && I == NumElmts - 1) { + auto EndOfVector = MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1); + auto AllLanesSelected = + MIRBuilder.buildICmp(CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector); + OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy}, {OutPos, EndOfVector}); + ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0)); - MaskI = MIRBuilder.buildZExt(IdxTy, MaskI); - OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI); + LastWriteVal = MIRBuilder.buildSelect(ValTy, AllLanesSelected, Val, LastWriteVal).getReg(0); + MIRBuilder.buildStore(LastWriteVal, ElmtPtr, ValPtrInfo, ValAlign); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f262637caa9d45..ac4a28ab98165b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11353,12 +11353,12 @@ SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, bool IsSplatPassthru = ISD::isConstantSplatVector(Passthru.getNode(), PassthruSplatVal); - if (HasPassthru && IsSplatPassthru) { + if (IsSplatPassthru) { // As we do not know which position we wrote to last, we cannot simply // access that index from the passthru vector. So we first check if passthru // is a splat vector, to use any element ... LastWriteVal = DAG.getConstant(PassthruSplatVal, DL, ScalarVT); - } else { + } else if (HasPassthru) { // ... if it is not a splat vector, we need to get the passthru value at // position = popcount(mask) and re-load it from the stack before it is // overwritten in the loop below. 
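Note for readers: the per-lane loop above stores every element unconditionally, so after the last iteration the running output position can point one past the final lane when all mask bits are set. The G_ICMP/G_UMIN/G_SELECT sequence guards against that. A purely illustrative C++ stand-in for that step (the lastLaneFixup helper and its names are invented for this note and are not part of the patch):

#include <algorithm>
#include <cstdint>

// Scalar stand-in for the final-iteration fix-up in lowerMASKED_COMPRESS:
// clamp the write position into range and decide whether the last vector
// element or the previously loaded passthru value must be stored there.
struct FixupResult {
  uint32_t StorePos;   // G_UMIN(OutPos, NumElts - 1)
  bool UseLastElement; // G_ICMP ugt OutPos, NumElts - 1
};

inline FixupResult lastLaneFixup(uint32_t OutPos, uint32_t NumElts) {
  FixupResult R;
  R.UseLastElement = OutPos > NumElts - 1;    // every mask lane was selected
  R.StorePos = std::min(OutPos, NumElts - 1); // keep the store in bounds
  return R;
}

The SelectionDAG expansion added earlier in this series performs the same clamp with an ISD::SETUGT compare and ISD::UMIN.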
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir index 62a5927efd197d..41b83d8d3214fe 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir @@ -18,7 +18,7 @@ body: | ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[C1]], [[C2]] ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64) - ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD]](p0) :: (store (s32) into %stack.0) + ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD]](p0) :: (store (s32)) ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C1]](s64) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s16) ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 @@ -31,7 +31,7 @@ body: | ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[AND1]](s32) ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C2]] ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64) - ; CHECK-NEXT: G_STORE [[EVEC2]](s32), [[PTR_ADD1]](p0) :: (store (s32) into %stack.0) + ; CHECK-NEXT: G_STORE [[EVEC2]](s32), [[PTR_ADD1]](p0) :: (store (s32)) ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C4]](s64) ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s16) ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]] @@ -42,7 +42,7 @@ body: | ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[AND3]](s32) ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(s64) = G_MUL [[SEXT1]], [[C2]] ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL2]](s64) - ; CHECK-NEXT: G_STORE [[EVEC4]](s32), [[PTR_ADD2]](p0) :: (store (s32) into %stack.0) + ; CHECK-NEXT: G_STORE [[EVEC4]](s32), [[PTR_ADD2]](p0) :: (store (s32)) ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C6]](s64) ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s16) ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]] @@ -53,15 +53,104 @@ body: | ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[AND5]](s32) ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s64) = G_MUL [[SEXT2]], [[C2]] ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL3]](s64) - ; CHECK-NEXT: G_STORE [[EVEC6]](s32), [[PTR_ADD3]](p0) :: (store (s32) into %stack.0) + ; CHECK-NEXT: G_STORE [[EVEC6]](s32), [[PTR_ADD3]](p0) :: (store (s32)) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[FRAME_INDEX]](p0) :: (load (<4 x s32>) from %stack.0) ; CHECK-NEXT: $q0 = COPY [[LOAD]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s16>) = COPY $d1 - %2:_(<4 x s32>) = G_MASKED_COMPRESS %0(<4 x s32>), %1(<4 x s16>) - $q0 = COPY %2(<4 x s32>) + %2:_(<4 x s32>) = G_IMPLICIT_DEF + %3:_(<4 x s32>) = G_MASKED_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) RET_ReallyLR implicit $q0 ... 
+--- +name: test_masked_compress_v4s32_with_passthru +body: | + bb.0: + liveins: $q0, $d1, $q2 + + + ; CHECK-LABEL: name: test_masked_compress_v4s32_with_passthru + ; CHECK: liveins: $q0, $d1, $q2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $d1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY $q2 + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: G_STORE [[COPY2]](<4 x s32>), [[FRAME_INDEX]](p0) :: (store (<4 x s32>) into %stack.0) + ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(<4 x s32>) = G_ZEXT [[COPY1]](<4 x s16>) + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[ZEXT]](<4 x s32>) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[VECREDUCE_ADD]], [[C1]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[AND]](s32) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[SEXT]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL]](s64) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s32)) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C3]](s64) + ; CHECK-NEXT: [[MUL1:%[0-9]+]]:_(s64) = G_MUL [[C3]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL1]](s64) + ; CHECK-NEXT: G_STORE [[EVEC]](s32), [[PTR_ADD1]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C3]](s64) + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC1]](s16) + ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C4]] + ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[C]], [[AND1]] + ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[EVEC2:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C5]](s64) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ADD]], [[C1]] + ; CHECK-NEXT: [[SEXT1:%[0-9]+]]:_(s64) = G_SEXT [[AND2]](s32) + ; CHECK-NEXT: [[MUL2:%[0-9]+]]:_(s64) = G_MUL [[SEXT1]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL2]](s64) + ; CHECK-NEXT: G_STORE [[EVEC2]](s32), [[PTR_ADD2]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC3:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C5]](s64) + ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC3]](s16) + ; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C4]] + ; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[AND3]] + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: [[EVEC4:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C6]](s64) + ; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ADD1]], [[C1]] + ; CHECK-NEXT: [[SEXT2:%[0-9]+]]:_(s64) = G_SEXT [[AND4]](s32) + ; CHECK-NEXT: [[MUL3:%[0-9]+]]:_(s64) = G_MUL [[SEXT2]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL3]](s64) + ; CHECK-NEXT: G_STORE [[EVEC4]](s32), [[PTR_ADD3]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC5:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C6]](s64) + ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC5]](s16) + ; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C4]] + ; 
CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[ADD1]], [[AND5]] + ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CHECK-NEXT: [[EVEC6:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C7]](s64) + ; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ADD2]], [[C1]] + ; CHECK-NEXT: [[SEXT3:%[0-9]+]]:_(s64) = G_SEXT [[AND6]](s32) + ; CHECK-NEXT: [[MUL4:%[0-9]+]]:_(s64) = G_MUL [[SEXT3]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL4]](s64) + ; CHECK-NEXT: G_STORE [[EVEC6]](s32), [[PTR_ADD4]](p0) :: (store (s32)) + ; CHECK-NEXT: [[EVEC7:%[0-9]+]]:_(s16) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s16>), [[C7]](s64) + ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[EVEC7]](s16) + ; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C4]] + ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[ADD2]], [[AND7]] + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s32) = G_ICMP intpred(ugt), [[ADD3]](s32), [[C1]] + ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s32) = G_ICMP intpred(ult), [[ADD3]](s32), [[C1]] + ; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s32), [[ADD3]], [[C1]] + ; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT]], [[C1]] + ; CHECK-NEXT: [[SEXT4:%[0-9]+]]:_(s64) = G_SEXT [[AND8]](s32) + ; CHECK-NEXT: [[MUL5:%[0-9]+]]:_(s64) = G_MUL [[SEXT4]], [[C2]] + ; CHECK-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[FRAME_INDEX]], [[MUL5]](s64) + ; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP]](s32), [[EVEC6]], [[LOAD]] + ; CHECK-NEXT: G_STORE [[SELECT1]](s32), [[PTR_ADD5]](p0) :: (store (s32)) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[FRAME_INDEX]](p0) :: (load (<4 x s32>) from %stack.0) + ; CHECK-NEXT: $q0 = COPY [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(<4 x s16>) = COPY $d1 + %2:_(<4 x s32>) = COPY $q2 + %3:_(<4 x s32>) = G_MASKED_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) + RET_ReallyLR implicit $q0 +... 
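The tail of the CHECK sequence above — the ugt compare, the ult-plus-select pair that forms the umin, and the final select and store — corresponds to the last-lane fixup. A minimal scalar sketch with invented names, assuming the <4 x s32> case from the test:

#include <algorithm>
#include <cstdint>

// Slot points at the stack temporary that holds the compressed result.
uint32_t finalLaneFixup(uint32_t OutPos, uint32_t LastVecElt,
                        uint32_t LastWriteVal, uint32_t *Slot) {
  const uint32_t EndOfVector = 3;                    // NumElements - 1
  bool AllLanesSelected = OutPos > EndOfVector;      // G_ICMP ugt
  OutPos = std::min(OutPos, EndOfVector);            // G_UMIN (ult + G_SELECT)
  Slot[OutPos] = AllLanesSelected ? LastVecElt : LastWriteVal; // G_SELECT, G_STORE
  return OutPos;
}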
+ diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index 926858ec1394cb..69ea50c8dac57a 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -279,7 +279,9 @@ define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) { define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) { ; CHECK-LABEL: test_compress_const_mask_passthrough: ; CHECK: ; %bb.0: -; CHECK-NEXT: mov.s v0[1], v0[3] +; CHECK-NEXT: ext.16b v1, v0, v1, #8 +; CHECK-NEXT: mov.s v1[0], v0[0] +; CHECK-NEXT: mov.16b v0, v1 ; CHECK-NEXT: ret %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> %passthru) ret <4 x i32> %out From daa784fa328713e09ffec8fdd4b9f0eb50de4ce9 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 17 Jun 2024 13:50:41 +0200 Subject: [PATCH 35/49] Fix formatting --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 36 ++++++++++++------- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 13 ++++--- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 76be3fe18a1e3f..f423dbf6c6e3c0 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -7514,7 +7514,8 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { LegalizerHelper::LegalizeResult LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { - auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] = MI.getFirst4RegLLTs(); + auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] = + MI.getFirst4RegLLTs(); if (VecTy.isScalableVector()) report_fatal_error("Cannot expand masked_compress for scalable vectors."); @@ -7525,7 +7526,8 @@ LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { createStackTemporary(TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo) .getReg(0); - MachinePointerInfo ValPtrInfo = MachinePointerInfo::getUnknownStack(*MI.getMF()); + MachinePointerInfo ValPtrInfo = + MachinePointerInfo::getUnknownStack(*MI.getMF()); LLT IdxTy = LLT::scalar(32); LLT ValTy = VecTy.getElementType(); @@ -7533,24 +7535,30 @@ LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { auto OutPos = MIRBuilder.buildConstant(IdxTy, 0); - bool HasPassthru = MRI.getVRegDef(Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF; // isGuaranteedNotToBeUndef(Passthru, MRI); + bool HasPassthru = + MRI.getVRegDef(Passthru)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF; if (HasPassthru) MIRBuilder.buildStore(Passthru, StackPtr, PtrInfo, VecAlign); Register LastWriteVal; std::optional PassthruSplatVal = - isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI);; + isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI); + ; if (PassthruSplatVal.has_value()) { - LastWriteVal = MIRBuilder.buildConstant(ValTy, PassthruSplatVal.value()).getReg(0); + LastWriteVal = + MIRBuilder.buildConstant(ValTy, PassthruSplatVal.value()).getReg(0); } else if (HasPassthru) { auto Popcount = MIRBuilder.buildZExt(MaskTy.changeElementSize(32), Mask); - Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD, {LLT::scalar(32)}, {Popcount}); + Popcount = MIRBuilder.buildInstr(TargetOpcode::G_VECREDUCE_ADD, + {LLT::scalar(32)}, {Popcount}); Register LastElmtPtr = getVectorElementPointer(StackPtr, VecTy, Popcount.getReg(0)); - LastWriteVal = MIRBuilder.buildLoad(ValTy, LastElmtPtr, 
ValPtrInfo, ValAlign).getReg(0); + LastWriteVal = + MIRBuilder.buildLoad(ValTy, LastElmtPtr, ValPtrInfo, ValAlign) + .getReg(0); } unsigned NumElmts = VecTy.getNumElements(); @@ -7570,13 +7578,17 @@ LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { OutPos = MIRBuilder.buildAdd(IdxTy, OutPos, MaskI); if (HasPassthru && I == NumElmts - 1) { - auto EndOfVector = MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1); - auto AllLanesSelected = - MIRBuilder.buildICmp(CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector); - OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy}, {OutPos, EndOfVector}); + auto EndOfVector = + MIRBuilder.buildConstant(IdxTy, VecTy.getNumElements() - 1); + auto AllLanesSelected = MIRBuilder.buildICmp( + CmpInst::ICMP_UGT, LLT::scalar(1), OutPos, EndOfVector); + OutPos = MIRBuilder.buildInstr(TargetOpcode::G_UMIN, {IdxTy}, + {OutPos, EndOfVector}); ElmtPtr = getVectorElementPointer(StackPtr, VecTy, OutPos.getReg(0)); - LastWriteVal = MIRBuilder.buildSelect(ValTy, AllLanesSelected, Val, LastWriteVal).getReg(0); + LastWriteVal = + MIRBuilder.buildSelect(ValTy, AllLanesSelected, Val, LastWriteVal) + .getReg(0); MIRBuilder.buildStore(LastWriteVal, ElmtPtr, ValPtrInfo, ValAlign); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 284f56e463346a..ca3dc2ba02c06b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12104,7 +12104,9 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { APInt SplatVal; if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) - return SplatVal.isAllOnes() ? Vec : (HasPassthru ? Passthru : DAG.getUNDEF(VecVT)); + return SplatVal.isAllOnes() + ? Vec + : (HasPassthru ? Passthru : DAG.getUNDEF(VecVT)); if (Vec.isUndef() || Mask.isUndef()) return DAG.getUNDEF(VecVT); @@ -12129,11 +12131,12 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { } } for (unsigned Rest = NumSelected; Rest < NumElmts; ++Rest) { - SDValue Val = HasPassthru ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Passthru, - DAG.getVectorIdxConstant(Rest - NumSelected, DL)) - : DAG.getUNDEF(ScalarVT); + SDValue Val = + HasPassthru + ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Passthru, + DAG.getVectorIdxConstant(Rest - NumSelected, DL)) + : DAG.getUNDEF(ScalarVT); Ops.push_back(Val); - } return DAG.getBuildVector(VecVT, DL, Ops); } From 3e4673c9ecd9687c809f81684d90a2918f319230 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 17 Jun 2024 14:54:00 +0200 Subject: [PATCH 36/49] Fix GlobalISel test --- .../CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 6443c13891366d..b9dd8cac4cc08f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -642,7 +642,7 @@ # DEBUG-NEXT: G_SPLAT_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: G_MASKED_COMPRESS (opcode {{[0-9]+}}): 3 type indices, 0 imm indices +# DEBUG-NEXT: G_MASKED_COMPRESS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices From 4b12b32381f05ba5ccb27181f746ce8b913e9e8c Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 17 Jun 2024 14:54:21 +0200 Subject: [PATCH 37/49] Update comment on MASKED_COMPRESS opcode --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 49ba84a83bb941..fb310898d51955 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1306,8 +1306,8 @@ enum NodeType { // consecutively place vector elements based on mask // e.g., vec = {A, B, C, D} and mask = {1, 0, 1, 0} // --> {A, C, ?, ?} where ? is undefined - // If passthru is defined, ?s are replaced with elements at corresponding - // positions from passthru. If passthru is undef, ?s remain undefined. + // If passthru is defined, ?s are replaced with elements from passthru. + // If passthru is undef, ?s remain undefined. MASKED_COMPRESS, // Masked gather and scatter - load and store operations for a vector of From 6c9c96902a9784d1f440d90aa19936874676ce8a Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 17 Jun 2024 18:05:28 +0200 Subject: [PATCH 38/49] Fix docs --- llvm/docs/LangRef.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 4ff62058461d95..356efa4a1f2357 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19258,7 +19258,7 @@ Selects elements from input vector '``value``' according to the '``mask``'. All selected elements are written into adjacent lanes in the result vector, from lower to higher. The mask holds an entry for each vector lane, and is used to select elements to be kept. If ``passthru`` is undefined, the number of valid lanes is equal to the number of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values are undefined. -If a ``passthru`` vector is given, all remaining lanes are filled with values from ``passthru``, starting from ``passthrough[0]``. +If a ``passthru`` vector is given, all remaining lanes are filled with the corresponding lane's value from ``passthru``. The main difference to :ref:`llvm.masked.compressstore ` is that the we do not need to guard against memory access for unselected lanes. This allows for branchless code and better optimization for all targets that do not support or have inefficient instructions of the explicit semantics of :ref:`llvm.masked.compressstore ` but still have some form of compress operations. @@ -19295,8 +19295,8 @@ If all entries in the ``mask`` are 0, the ``out`` vector is ``passthru``. 
out[idx] = vec[i]; idx += static_cast(mask[i]); } - for (int i = idx; i < N / sizeof(int); ++i) { - out[idx] = passthru[i - idx]; + for (; idx < N / sizeof(int); ++idx) { + out[idx] = passthru[idx]; } return out; } From 069eb2421de24742f4a34c9b5218091ee708a7f9 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 18 Jun 2024 21:13:38 +0200 Subject: [PATCH 39/49] Address PR comments --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 23 ++- .../CodeGen/SelectionDAG/TargetLowering.cpp | 15 +- llvm/test/CodeGen/AArch64/masked-compress.ll | 167 +++++++++--------- 3 files changed, 109 insertions(+), 96 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ca3dc2ba02c06b..19a25f6c73a6e4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -10055,7 +10055,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) && LHSC.getZExtValue() <= RHSC.getZExtValue(); }; - + // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2 if (N0->getFlags().hasExact()) { @@ -12103,10 +12103,21 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { bool HasPassthru = !Passthru.isUndef(); APInt SplatVal; - if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) - return SplatVal.isAllOnes() - ? Vec - : (HasPassthru ? Passthru : DAG.getUNDEF(VecVT)); + if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) { + bool HasTrueBoolContent = [&] { + switch (TLI.getBooleanContents(Mask.getValueType())) { + case TargetLoweringBase::UndefinedBooleanContent: + return SplatVal.isOne(); + case TargetLoweringBase::ZeroOrOneBooleanContent: + return SplatVal.isOneBitSet(0); + case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: + return SplatVal.isAllOnes(); + } + }(); + + return HasTrueBoolContent ? Vec + : (HasPassthru ? Passthru : DAG.getUNDEF(VecVT)); + } if (Vec.isUndef() || Mask.isUndef()) return DAG.getUNDEF(VecVT); @@ -12134,7 +12145,7 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { SDValue Val = HasPassthru ? 
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Passthru, - DAG.getVectorIdxConstant(Rest - NumSelected, DL)) + DAG.getVectorIdxConstant(Rest, DL)) : DAG.getUNDEF(ScalarVT); Ops.push_back(Val); } diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 025f75c18f8e13..af48c03baf6ca7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11358,8 +11358,9 @@ SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); + MVT PositionVT = getVectorIdxTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); - SDValue OutPos = DAG.getConstant(0, DL, MVT::i32); + SDValue OutPos = DAG.getConstant(0, DL, PositionVT); bool HasPassthru = !Passthru.isUndef(); @@ -11386,8 +11387,8 @@ SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, SDValue Popcount = DAG.getNode( ISD::TRUNCATE, DL, MaskVT.changeVectorElementType(MVT::i1), Mask); Popcount = DAG.getNode(ISD::ZERO_EXTEND, DL, - MaskVT.changeVectorElementType(MVT::i32), Popcount); - Popcount = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, Popcount); + MaskVT.changeVectorElementType(ScalarVT), Popcount); + Popcount = DAG.getNode(ISD::VECREDUCE_ADD, DL, ScalarVT, Popcount); SDValue LastElmtPtr = getVectorElementPointer(DAG, StackPtr, VecVT, Popcount); LastWriteVal = DAG.getLoad( @@ -11411,15 +11412,15 @@ SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, SDValue MaskI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); - MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, MaskI); - OutPos = DAG.getNode(ISD::ADD, DL, MVT::i32, OutPos, MaskI); + MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, PositionVT, MaskI); + OutPos = DAG.getNode(ISD::ADD, DL, PositionVT, OutPos, MaskI); if (HasPassthru && I == NumElms - 1) { SDValue EndOfVector = - DAG.getConstant(VecVT.getVectorNumElements() - 1, DL, MVT::i32); + DAG.getConstant(VecVT.getVectorNumElements() - 1, DL, PositionVT); SDValue AllLanesSelected = DAG.getSetCC(DL, MVT::i1, OutPos, EndOfVector, ISD::CondCode::SETUGT); - OutPos = DAG.getNode(ISD::UMIN, DL, MVT::i32, OutPos, EndOfVector); + OutPos = DAG.getNode(ISD::UMIN, DL, PositionVT, OutPos, EndOfVector); OutPtr = getVectorElementPointer(DAG, StackPtr, VecVT, OutPos); // Re-write the last ValI if all lanes were selected. 
Otherwise, diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index 69ea50c8dac57a..64ebac51dae349 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -15,10 +15,10 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) { ; CHECK-NEXT: mov.s w10, v1[2] ; CHECK-NEXT: fmov w11, s1 ; CHECK-NEXT: bfi x8, x11, #2, #1 -; CHECK-NEXT: and w11, w11, #0x1 -; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 ; CHECK-NEXT: and w10, w10, #0x1 -; CHECK-NEXT: add w9, w11, w9 +; CHECK-NEXT: add x9, x11, x9 ; CHECK-NEXT: mov x11, sp ; CHECK-NEXT: st1.s { v0 }[1], [x8] ; CHECK-NEXT: add w10, w9, w10 @@ -53,21 +53,21 @@ define <4 x i32> @test_compress_v4i32_with_passthru(<4 x i32> %vec, <4 x i1> %ma ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: mov x14, sp ; CHECK-NEXT: bfi x12, x16, #2, #1 -; CHECK-NEXT: and w16, w16, #0x1 +; CHECK-NEXT: and x16, x16, #0x1 ; CHECK-NEXT: mov w15, #3 ; =0x3 -; CHECK-NEXT: and w8, w8, #0x1 -; CHECK-NEXT: add w8, w16, w8 +; CHECK-NEXT: and x8, x8, #0x1 +; CHECK-NEXT: add x8, x16, x8 ; CHECK-NEXT: fmov w16, s2 -; CHECK-NEXT: and w11, w11, #0x1 -; CHECK-NEXT: and w13, w13, #0x1 -; CHECK-NEXT: add w11, w8, w11 +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: and x13, x13, #0x1 +; CHECK-NEXT: add x11, x8, x11 ; CHECK-NEXT: orr x8, x9, x8, lsl #2 -; CHECK-NEXT: add w13, w11, w13 +; CHECK-NEXT: add x13, x11, x13 ; CHECK-NEXT: bfi x14, x11, #2, #2 ; CHECK-NEXT: bfi x10, x16, #2, #2 ; CHECK-NEXT: mov.s w16, v0[3] -; CHECK-NEXT: cmp w13, #3 -; CHECK-NEXT: csel w11, w13, w15, lo +; CHECK-NEXT: cmp x13, #3 +; CHECK-NEXT: csel x11, x13, x15, lo ; CHECK-NEXT: ldr w10, [x10] ; CHECK-NEXT: str s0, [sp] ; CHECK-NEXT: st1.s { v0 }[1], [x12] @@ -117,75 +117,75 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { ; CHECK-NEXT: umov.b w11, v1[2] ; CHECK-NEXT: umov.b w14, v1[3] ; CHECK-NEXT: bfxil x12, x9, #0, #1 -; CHECK-NEXT: and w10, w10, #0x1 -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: add x9, x9, x10 ; CHECK-NEXT: umov.b w10, v1[4] -; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and x11, x11, #0x1 ; CHECK-NEXT: st1.b { v0 }[1], [x12] ; CHECK-NEXT: orr x12, x8, x9 -; CHECK-NEXT: add w9, w9, w11 +; CHECK-NEXT: add x9, x9, x11 ; CHECK-NEXT: umov.b w11, v1[5] -; CHECK-NEXT: and w14, w14, #0x1 +; CHECK-NEXT: and x14, x14, #0x1 ; CHECK-NEXT: st1.b { v0 }[2], [x12] -; CHECK-NEXT: add w14, w9, w14 +; CHECK-NEXT: add x14, x9, x14 ; CHECK-NEXT: umov.b w12, v1[6] ; CHECK-NEXT: orr x9, x8, x9 -; CHECK-NEXT: and w10, w10, #0x1 +; CHECK-NEXT: and x10, x10, #0x1 ; CHECK-NEXT: st1.b { v0 }[3], [x9] ; CHECK-NEXT: orr x9, x8, x14 -; CHECK-NEXT: add w10, w14, w10 +; CHECK-NEXT: add x10, x14, x10 ; CHECK-NEXT: umov.b w14, v1[7] ; CHECK-NEXT: st1.b { v0 }[4], [x9] -; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and x11, x11, #0x1 ; CHECK-NEXT: bfxil x13, x10, #0, #4 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: add x10, x10, x11 ; CHECK-NEXT: umov.b w11, v1[8] -; CHECK-NEXT: and w12, w12, #0x1 +; CHECK-NEXT: and x12, x12, #0x1 ; CHECK-NEXT: bfxil x9, x10, #0, #4 ; CHECK-NEXT: st1.b { v0 }[5], [x13] ; CHECK-NEXT: umov.b w13, v1[9] -; CHECK-NEXT: add w10, w10, w12 +; CHECK-NEXT: add x10, x10, x12 ; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: and w14, w14, #0x1 +; CHECK-NEXT: and x14, x14, #0x1 ; 
CHECK-NEXT: st1.b { v0 }[6], [x9] ; CHECK-NEXT: umov.b w9, v1[10] ; CHECK-NEXT: bfxil x12, x10, #0, #4 -; CHECK-NEXT: add w10, w10, w14 +; CHECK-NEXT: add x10, x10, x14 ; CHECK-NEXT: mov x14, sp -; CHECK-NEXT: and w11, w11, #0x1 +; CHECK-NEXT: and x11, x11, #0x1 ; CHECK-NEXT: bfxil x14, x10, #0, #4 -; CHECK-NEXT: add w10, w10, w11 +; CHECK-NEXT: add x10, x10, x11 ; CHECK-NEXT: mov x11, sp -; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: and x13, x13, #0x1 ; CHECK-NEXT: st1.b { v0 }[7], [x12] ; CHECK-NEXT: mov x12, sp ; CHECK-NEXT: bfxil x11, x10, #0, #4 -; CHECK-NEXT: add w10, w10, w13 +; CHECK-NEXT: add x10, x10, x13 ; CHECK-NEXT: umov.b w13, v1[11] ; CHECK-NEXT: st1.b { v0 }[8], [x14] ; CHECK-NEXT: umov.b w14, v1[12] -; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 ; CHECK-NEXT: bfxil x12, x10, #0, #4 -; CHECK-NEXT: add w9, w10, w9 +; CHECK-NEXT: add x9, x10, x9 ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: st1.b { v0 }[9], [x11] ; CHECK-NEXT: umov.b w11, v1[13] ; CHECK-NEXT: bfxil x10, x9, #0, #4 ; CHECK-NEXT: st1.b { v0 }[10], [x12] ; CHECK-NEXT: umov.b w12, v1[14] -; CHECK-NEXT: and w13, w13, #0x1 -; CHECK-NEXT: and w14, w14, #0x1 -; CHECK-NEXT: add w9, w9, w13 +; CHECK-NEXT: and x13, x13, #0x1 +; CHECK-NEXT: and x14, x14, #0x1 +; CHECK-NEXT: add x9, x9, x13 ; CHECK-NEXT: st1.b { v0 }[11], [x10] ; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: add w13, w9, w14 +; CHECK-NEXT: add x13, x9, x14 ; CHECK-NEXT: mov x14, sp ; CHECK-NEXT: bfxil x10, x9, #0, #4 -; CHECK-NEXT: and w9, w11, #0x1 +; CHECK-NEXT: and x9, x11, #0x1 ; CHECK-NEXT: mov x11, sp -; CHECK-NEXT: add w9, w13, w9 +; CHECK-NEXT: add x9, x13, x9 ; CHECK-NEXT: and w12, w12, #0x1 ; CHECK-NEXT: bfxil x14, x13, #0, #4 ; CHECK-NEXT: bfxil x11, x9, #0, #4 @@ -207,46 +207,46 @@ define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: ; kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: umov.b w8, v2[0] -; CHECK-NEXT: umov.b w9, v2[1] +; CHECK-NEXT: umov.b w9, v2[0] +; CHECK-NEXT: umov.b w10, v2[1] ; CHECK-NEXT: mov x12, sp -; CHECK-NEXT: umov.b w10, v2[2] +; CHECK-NEXT: umov.b w11, v2[2] ; CHECK-NEXT: umov.b w13, v2[3] -; CHECK-NEXT: mov x11, sp +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: umov.b w14, v2[4] ; CHECK-NEXT: str s0, [sp] -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: and w15, w8, #0x1 -; CHECK-NEXT: bfi x12, x8, #2, #1 -; CHECK-NEXT: and w8, w10, #0x1 -; CHECK-NEXT: add w9, w15, w9 -; CHECK-NEXT: umov.b w10, v2[5] -; CHECK-NEXT: add w8, w9, w8 -; CHECK-NEXT: orr x15, x11, x9, lsl #2 -; CHECK-NEXT: umov.b w9, v2[6] +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x15, x9, #0x1 +; CHECK-NEXT: bfi x12, x9, #2, #1 +; CHECK-NEXT: and x9, x11, #0x1 +; CHECK-NEXT: add x10, x15, x10 +; CHECK-NEXT: umov.b w11, v2[5] +; CHECK-NEXT: add x9, x10, x9 +; CHECK-NEXT: orr x15, x8, x10, lsl #2 +; CHECK-NEXT: umov.b w10, v2[6] ; CHECK-NEXT: st1.s { v0 }[1], [x12] -; CHECK-NEXT: add x12, x11, w8, uxtw #2 -; CHECK-NEXT: and w13, w13, #0x1 +; CHECK-NEXT: add x12, x8, x9, lsl #2 +; CHECK-NEXT: and x13, x13, #0x1 ; CHECK-NEXT: st1.s { v0 }[2], [x15] -; CHECK-NEXT: add w8, w8, w13 +; CHECK-NEXT: add x9, x9, x13 ; CHECK-NEXT: st1.s { v0 }[3], [x12] -; CHECK-NEXT: and w12, w14, #0x1 +; CHECK-NEXT: and x12, x14, #0x1 +; CHECK-NEXT: and x11, x11, #0x1 +; CHECK-NEXT: add x12, x9, x12 ; CHECK-NEXT: and w10, w10, #0x1 -; CHECK-NEXT: add w12, w8, w12 -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: and x8, x8, #0x7 -; CHECK-NEXT: add w10, w12, w10 +; CHECK-NEXT: 
and x9, x9, #0x7 +; CHECK-NEXT: add x11, x12, x11 ; CHECK-NEXT: and x12, x12, #0x7 -; CHECK-NEXT: str s1, [x11, x8, lsl #2] -; CHECK-NEXT: add w9, w10, w9 +; CHECK-NEXT: str s1, [x8, x9, lsl #2] +; CHECK-NEXT: add w10, w11, w10 +; CHECK-NEXT: and x11, x11, #0x7 +; CHECK-NEXT: add x12, x8, x12, lsl #2 ; CHECK-NEXT: and x10, x10, #0x7 -; CHECK-NEXT: add x12, x11, x12, lsl #2 -; CHECK-NEXT: and x9, x9, #0x7 -; CHECK-NEXT: add x8, x11, x10, lsl #2 -; CHECK-NEXT: add x9, x11, x9, lsl #2 +; CHECK-NEXT: add x9, x8, x11, lsl #2 +; CHECK-NEXT: add x8, x8, x10, lsl #2 ; CHECK-NEXT: st1.s { v1 }[1], [x12] -; CHECK-NEXT: st1.s { v1 }[2], [x8] -; CHECK-NEXT: st1.s { v1 }[3], [x9] +; CHECK-NEXT: st1.s { v1 }[2], [x9] +; CHECK-NEXT: st1.s { v1 }[3], [x8] ; CHECK-NEXT: ldp q0, q1, [sp], #32 ; CHECK-NEXT: ret %out = call <8 x i32> @llvm.masked.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> undef) @@ -261,6 +261,7 @@ define <4 x i32> @test_compress_all_const() { ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr q0, [x8, lCPI5_0@PAGEOFF] ; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 %out = call <4 x i32> @llvm.masked.compress(<4 x i32> , <4 x i1> , <4 x i32> undef) @@ -279,7 +280,7 @@ define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) { define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> %passthru) { ; CHECK-LABEL: test_compress_const_mask_passthrough: ; CHECK: ; %bb.0: -; CHECK-NEXT: ext.16b v1, v0, v1, #8 +; CHECK-NEXT: mov.d v1[0], v0[1] ; CHECK-NEXT: mov.s v1[0], v0[0] ; CHECK-NEXT: mov.16b v0, v1 ; CHECK-NEXT: ret @@ -291,9 +292,9 @@ define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) { ; CHECK-LABEL: test_compress_const_mask_const_passthrough: ; CHECK: ; %bb.0: ; CHECK-NEXT: mov.s v0[1], v0[3] -; CHECK-NEXT: mov w8, #5 ; =0x5 +; CHECK-NEXT: mov w8, #7 ; =0x7 ; CHECK-NEXT: mov.s v0[2], w8 -; CHECK-NEXT: mov w8, #6 ; =0x6 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: mov.s v0[3], w8 ; CHECK-NEXT: ret %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> ) @@ -354,9 +355,9 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) { ; CHECK-NEXT: umov.h w10, v1[1] ; CHECK-NEXT: umov.h w11, v1[2] ; CHECK-NEXT: bfi x8, x9, #1, #1 -; CHECK-NEXT: and w10, w10, #0x1 -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: add x9, x9, x10 ; CHECK-NEXT: and w11, w11, #0x1 ; CHECK-NEXT: add x10, sp, #8 ; CHECK-NEXT: add w11, w9, w11 @@ -386,9 +387,9 @@ define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mas ; CHECK-NEXT: umov.h w10, v1[1] ; CHECK-NEXT: umov.h w11, v1[2] ; CHECK-NEXT: bfi x8, x9, #1, #1 -; CHECK-NEXT: and w10, w10, #0x1 -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: add w9, w9, w10 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: add x9, x9, x10 ; CHECK-NEXT: and w11, w11, #0x1 ; CHECK-NEXT: add x10, sp, #8 ; CHECK-NEXT: add w11, w9, w11 @@ -421,10 +422,10 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { ; CHECK-NEXT: mov.s w9, v1[2] ; CHECK-NEXT: fmov w10, s1 ; CHECK-NEXT: bfi x11, x10, #2, #1 -; CHECK-NEXT: and w10, w10, #0x1 -; CHECK-NEXT: and w8, w8, #0x1 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: and x8, x8, #0x1 ; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: add w8, w10, w8 +; CHECK-NEXT: add x8, x10, x8 ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: st1.s { v0 }[1], [x11] ; CHECK-NEXT: add w9, w8, w9 @@ -452,10 +453,10 
@@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; CHECK-NEXT: cmlt.4h v0, v0, #0 ; CHECK-NEXT: umov.h w8, v0[0] ; CHECK-NEXT: umov.h w9, v0[1] -; CHECK-NEXT: and w9, w9, #0x1 -; CHECK-NEXT: and w11, w8, #0x1 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: and x11, x8, #0x1 ; CHECK-NEXT: bfi x10, x8, #1, #1 -; CHECK-NEXT: add w8, w11, w9 +; CHECK-NEXT: add x8, x11, x9 ; CHECK-NEXT: add x9, sp, #8 ; CHECK-NEXT: orr x8, x9, x8, lsl #1 ; CHECK-NEXT: strh w1, [x10] From 5e3189b4d0278482254eba48f3d77b093aa34565 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 19 Jun 2024 12:28:17 +0200 Subject: [PATCH 40/49] Use isConstTrueVal --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 2bed743c9bd165..81e9c492da59a7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12091,19 +12091,9 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { APInt SplatVal; if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) { - bool HasTrueBoolContent = [&] { - switch (TLI.getBooleanContents(Mask.getValueType())) { - case TargetLoweringBase::UndefinedBooleanContent: - return SplatVal.isOne(); - case TargetLoweringBase::ZeroOrOneBooleanContent: - return SplatVal.isOneBitSet(0); - case TargetLoweringBase::ZeroOrNegativeOneBooleanContent: - return SplatVal.isAllOnes(); - } - }(); - - return HasTrueBoolContent ? Vec - : (HasPassthru ? Passthru : DAG.getUNDEF(VecVT)); + return TLI.isConstTrueVal(Mask) + ? Vec + : (HasPassthru ? Passthru : DAG.getUNDEF(VecVT)); } if (Vec.isUndef() || Mask.isUndef()) @@ -12120,8 +12110,7 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { if (MaskI.isUndef()) continue; - ConstantSDNode *CMaskI = cast(MaskI); - if (CMaskI->isAllOnes()) { + if (TLI.isConstTrueVal(MaskI)) { SDValue VecI = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Vec, DAG.getVectorIdxConstant(I, DL)); Ops.push_back(VecI); From 6b3a3b8738ec132300a7553d30706b1199e19133 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 19 Jun 2024 14:13:28 +0200 Subject: [PATCH 41/49] Remove redundant undef --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 81e9c492da59a7..421e6964048404 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12090,11 +12090,8 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { bool HasPassthru = !Passthru.isUndef(); APInt SplatVal; - if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) { - return TLI.isConstTrueVal(Mask) - ? Vec - : (HasPassthru ? Passthru : DAG.getUNDEF(VecVT)); - } + if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) + return TLI.isConstTrueVal(Mask) ? 
Vec : Passthru; if (Vec.isUndef() || Mask.isUndef()) return DAG.getUNDEF(VecVT); From 50cce2363780366118334bd96a51a11bf72e94f2 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 21 Jun 2024 13:12:30 +0200 Subject: [PATCH 42/49] Address PR comments --- llvm/docs/LangRef.rst | 43 +++++++++++++------ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3f142af00e169e..5e3664f0a8e8fe 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19258,23 +19258,36 @@ The remaining lanes are filled with values from ``passthru``. Overview: """"""""" -Selects elements from input vector '``value``' according to the '``mask``'. -All selected elements are written into adjacent lanes in the result vector, from lower to higher. -The mask holds an entry for each vector lane, and is used to select elements to be kept. -If ``passthru`` is undefined, the number of valid lanes is equal to the number of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values are undefined. -If a ``passthru`` vector is given, all remaining lanes are filled with the corresponding lane's value from ``passthru``. -The main difference to :ref:`llvm.masked.compressstore ` is that the we do not need to guard against memory access for unselected lanes. -This allows for branchless code and better optimization for all targets that do not support or have inefficient instructions -of the explicit semantics of :ref:`llvm.masked.compressstore ` but still have some form of compress operations. -The result vector can be written with a similar effect, as all the selected values are at the lower positions of the -vector, but without requiring branches to avoid writes where the mask is ``false``. +Selects elements from input vector ``value`` according to the ``mask``. +All selected elements are written into adjacent lanes in the result vector, +from lower to higher. +The mask holds an entry for each vector lane, and is used to select elements +to be kept. +If a mask entry if undefined or poison, it is treated as "false", i.e., the +element is not selected. +If ``passthru`` is undefined, the number of valid lanes is equal to the number +of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values +are undefined. +If a ``passthru`` vector is given, all remaining lanes are filled with the +corresponding lane's value from ``passthru``. +The main difference to :ref:`llvm.masked.compressstore ` is +that the we do not need to guard against memory access for unselected lanes. +This allows for branchless code and better optimization for all targets that +do not support or have inefficient +instructions of the explicit semantics of +:ref:`llvm.masked.compressstore ` but still have some form +of compress operations. +The result vector can be written with a similar effect, as all the selected +values are at the lower positions of the vector, but without requiring +branches to avoid writes where the mask is ``false``. Arguments: """""""""" The first operand is the input vector, from which elements are selected. The second operand is the mask, a vector of boolean values. -The third operand is the passthru vector, from which elements are filled into remaining lanes. +The third operand is the passthru vector, from which elements are filled +into remaining lanes. The mask and the input vector must have the same number of vector elements. The input and passthru vectors must have the same type. 
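A concrete instance of the semantics described above (illustrative values, not taken from the patch): with vec = {1, 2, 3, 4}, mask = {1, 0, 1, 0} and passthru = {5, 6, 7, 8}, the result is {1, 3, 7, 8} — the two selected elements packed into the low lanes, the remaining lanes keeping their corresponding passthru values. A small self-checking C++ reference:

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<int> referenceCompress(const std::vector<int> &Vec,
                                   const std::vector<bool> &Mask,
                                   const std::vector<int> &Passthru) {
  std::vector<int> Out = Passthru;  // remaining lanes keep passthru values
  std::size_t Idx = 0;
  for (std::size_t I = 0; I < Vec.size(); ++I)
    if (Mask[I])                    // an undef/poison mask lane counts as false
      Out[Idx++] = Vec[I];
  return Out;
}

int main() {
  assert((referenceCompress({1, 2, 3, 4}, {true, false, true, false},
                            {5, 6, 7, 8}) == std::vector<int>{1, 3, 7, 8}));
  return 0;
}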
@@ -19282,10 +19295,12 @@ Semantics: """""""""" The ``llvm.masked.compress`` intrinsic compresses data within a vector. -It collects elements from possibly non-adjacent lanes of a vector and place them contiguously in the result vector based -on a selection mask and fill the remaining lanes with values from ``passthru``. +It collects elements from possibly non-adjacent lanes of a vector and place +them contiguously in the result vector based on a selection mask and fill the +remaining lanes with values from ``passthru``. This intrinsic performs the logic of the following C++ example. -All values in ``out`` after the last selected one are undefined if ``passthru`` is undefined. +All values in ``out`` after the last selected one are undefined if +``passthru`` is undefined. If all entries in the ``mask`` are 0, the ``out`` vector is ``passthru``. .. code-block:: cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 421e6964048404..305d95074166d8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12093,7 +12093,7 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) return TLI.isConstTrueVal(Mask) ? Vec : Passthru; - if (Vec.isUndef() || Mask.isUndef()) + if ((Vec.isUndef() && Passthru.isUndef()) || Mask.isUndef()) return DAG.getUNDEF(VecVT); // No need for potentially expensive compress if the mask is constant. @@ -12104,6 +12104,7 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { unsigned NumElmts = VecVT.getVectorNumElements(); for (unsigned I = 0; I < NumElmts; ++I) { SDValue MaskI = Mask.getOperand(I); + // We treat undef mask entries as "false". if (MaskI.isUndef()) continue; From 89c3e9f83bb2b8fae8a829776f07c363cacd6c3d Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Fri, 21 Jun 2024 13:37:28 +0200 Subject: [PATCH 43/49] Return passthru for undef mask or vec --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 305d95074166d8..73e4e88c498669 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12093,8 +12093,8 @@ SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { if (ISD::isConstantSplatVector(Mask.getNode(), SplatVal)) return TLI.isConstTrueVal(Mask) ? Vec : Passthru; - if ((Vec.isUndef() && Passthru.isUndef()) || Mask.isUndef()) - return DAG.getUNDEF(VecVT); + if (Vec.isUndef() || Mask.isUndef()) + return Passthru; // No need for potentially expensive compress if the mask is constant. 
if (ISD::isBuildVectorOfConstantSDNodes(Mask.getNode())) { From 995c863e2b391d3d994cd94127280b981b314e95 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 25 Jun 2024 10:20:19 +0200 Subject: [PATCH 44/49] Address PR comments --- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 1 - llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 2 +- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- llvm/test/CodeGen/AArch64/masked-compress.ll | 8 +++++--- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index f423dbf6c6e3c0..f8949062ecf6ff 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -7544,7 +7544,6 @@ LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { Register LastWriteVal; std::optional PassthruSplatVal = isConstantOrConstantSplatVector(*MRI.getVRegDef(Passthru), MRI); - ; if (PassthruSplatVal.has_value()) { LastWriteVal = diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 344610cd22d304..3f090380da7e6d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -5748,7 +5748,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MASKED_COMPRESS(SDNode *N) { WideVecVT.getVectorNumElements()); SDValue WideVec = ModifyToType(Vec, WideVecVT); - SDValue WideMask = ModifyToType(Mask, WideMaskVT); + SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true); SDValue WidePassthru = ModifyToType(Passthru, WideVecVT); return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), WideVecVT, WideVec, WideMask, WidePassthru); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index ca5b8e09cd9fac..d8a2c614e5ead7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7519,7 +7519,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, "Vector and mask must have same number of elements."); if (N1.isUndef() || N2.isUndef()) - return getUNDEF(VecVT); + return N3; break; } diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/masked-compress.ll index 64ebac51dae349..436e5af1e1e44d 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/masked-compress.ll @@ -297,7 +297,7 @@ define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) { ; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: mov.s v0[3], w8 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> ) + %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> ) ret <4 x i32> %out } @@ -410,9 +410,10 @@ define <3 x i32> @test_compress_narrow(<3 x i32> %vec, <3 x i1> %mask) { ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: movi.2d v1, #0000000000000000 ; CHECK-NEXT: mov x11, sp ; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: mov.h v1[0], w0 ; CHECK-NEXT: mov.h v1[1], w1 ; CHECK-NEXT: mov.h v1[2], w2 ; CHECK-NEXT: ushll.4s v1, v1, #0 @@ -444,9 +445,10 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: fmov s0, w3 +; CHECK-NEXT: 
movi.2d v0, #0000000000000000 ; CHECK-NEXT: add x10, sp, #8 ; CHECK-NEXT: strh w0, [sp, #8] +; CHECK-NEXT: mov.h v0[0], w3 ; CHECK-NEXT: mov.h v0[1], w4 ; CHECK-NEXT: mov.h v0[2], w5 ; CHECK-NEXT: shl.4h v0, v0, #15 From adb9d9c27fa84576a0fc017cbb279c3bfe175572 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 1 Jul 2024 11:08:00 +0200 Subject: [PATCH 45/49] Rename masked.compress to experimental.vector.compress --- llvm/docs/GlobalISel/GenericOpcode.rst | 2 +- llvm/docs/LangRef.rst | 10 +++--- .../llvm/CodeGen/GlobalISel/LegalizerHelper.h | 2 +- llvm/include/llvm/CodeGen/ISDOpcodes.h | 16 ++++----- llvm/include/llvm/CodeGen/TargetLowering.h | 4 +-- llvm/include/llvm/IR/Intrinsics.td | 2 +- llvm/include/llvm/Support/TargetOpcodes.def | 2 +- llvm/include/llvm/Target/GenericOpcodes.td | 2 +- .../Target/GlobalISel/SelectionDAGCompat.td | 2 +- .../include/llvm/Target/TargetSelectionDAG.td | 2 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 4 +-- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 6 ++-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++-- .../SelectionDAG/LegalizeIntegerTypes.cpp | 18 +++++----- llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 8 ++--- .../SelectionDAG/LegalizeVectorOps.cpp | 6 ++-- .../SelectionDAG/LegalizeVectorTypes.cpp | 18 +++++----- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 2 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 14 ++++---- .../SelectionDAG/SelectionDAGDumper.cpp | 2 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 2 +- llvm/lib/CodeGen/TargetLoweringBase.cpp | 2 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 2 +- ...press.mir => legalize-vector-compress.mir} | 12 +++---- .../GlobalISel/legalizer-info-validation.mir | 2 +- ...{masked-compress.ll => vector-compress.ll} | 36 +++++++++---------- 26 files changed, 92 insertions(+), 92 deletions(-) rename llvm/test/CodeGen/AArch64/GlobalISel/{legalize-masked-compress.mir => legalize-vector-compress.mir} (96%) rename llvm/test/CodeGen/AArch64/{masked-compress.ll => vector-compress.ll} (88%) diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index e44d2bf37aff70..375516a9d32ab4 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -710,7 +710,7 @@ The type of the operand must be equal to or larger than the vector element type. If the operand is larger than the vector element type, the scalar is implicitly truncated to the vector element type. -G_MASKED_COMPRESS +G_VECTOR_COMPRESS ^^^^^^^^^^^^^^^^^ Given an input vector, a mask vector, and a passthru vector, continuously place diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 5e3664f0a8e8fe..2129b2c7f58271 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19235,9 +19235,9 @@ the follow sequence of operations: The ``mask`` operand will apply to at least the gather and scatter operations. -.. _int_masked_compress: +.. _int_vector_compress: -'``llvm.masked.compress.*``' Intrinsics +'``llvm.experimental.vector.compress.*``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ LLVM provides an intrinsic for compressing data within a vector based on a selection mask. @@ -19252,8 +19252,8 @@ The remaining lanes are filled with values from ``passthru``. 
:: code-block:: llvm - declare <8 x i32> @llvm.masked.compress.v8i32(<8 x i32> , <8 x i1> , <8 x i32> ) - declare <16 x float> @llvm.masked.compress.v16f32(<16 x float> , <16 x i1> , <16 x float> undef) + declare <8 x i32> @llvm.experimental.vector.compress.v8i32(<8 x i32> , <8 x i1> , <8 x i32> ) + declare <16 x float> @llvm.experimental.vector.compress.v16f32(<16 x float> , <16 x i1> , <16 x float> undef) Overview: """"""""" @@ -19294,7 +19294,7 @@ The input and passthru vectors must have the same type. Semantics: """""""""" -The ``llvm.masked.compress`` intrinsic compresses data within a vector. +The ``llvm.experimental.vector.compress`` intrinsic compresses data within a vector. It collects elements from possibly non-adjacent lanes of a vector and place them contiguously in the result vector based on a selection mask and fill the remaining lanes with values from ``passthru``. diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 6b379c699d42ae..c369271c3aa340 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -410,7 +410,7 @@ class LegalizerHelper { LegalizeResult lowerUnmergeValues(MachineInstr &MI); LegalizeResult lowerExtractInsertVectorElt(MachineInstr &MI); LegalizeResult lowerShuffleVector(MachineInstr &MI); - LegalizeResult lowerMASKED_COMPRESS(MachineInstr &MI); + LegalizeResult lowerVECTOR_COMPRESS(MachineInstr &MI); Register getDynStackAllocTargetPtr(Register SPReg, Register AllocSize, Align Alignment, LLT PtrTy); LegalizeResult lowerDynStackAlloc(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index fb310898d51955..3100a95d283013 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -647,6 +647,14 @@ enum NodeType { /// non-constant operands. STEP_VECTOR, + /// VECTOR_COMPRESS(Vec, Mask, Passthru) + /// consecutively place vector elements based on mask + /// e.g., vec = {A, B, C, D} and mask = {1, 0, 1, 0} + /// --> {A, C, ?, ?} where ? is undefined + /// If passthru is defined, ?s are replaced with elements from passthru. + /// If passthru is undef, ?s remain undefined. + VECTOR_COMPRESS, + /// MULHU/MULHS - Multiply high - Multiply two integers of type iN, /// producing an unsigned/signed value of type i[2*N], then return the top /// part. @@ -1302,14 +1310,6 @@ enum NodeType { MLOAD, MSTORE, - // MASKED_COMPRESS(Vec, Mask, Passthru) - // consecutively place vector elements based on mask - // e.g., vec = {A, B, C, D} and mask = {1, 0, 1, 0} - // --> {A, C, ?, ?} where ? is undefined - // If passthru is defined, ?s are replaced with elements from passthru. - // If passthru is undef, ?s remain undefined. - MASKED_COMPRESS, - // Masked gather and scatter - load and store operations for a vector of // random addresses with additional mask operand that prevents memory // accesses to the masked-off lanes. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 814afe54d51e01..f349a46e899a56 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5495,9 +5495,9 @@ class TargetLowering : public TargetLoweringBase { /// method accepts vectors as its arguments. 
SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const; - /// Expand a vector MASKED_COMPRESS into a sequence of extract element, store + /// Expand a vector VECTOR_COMPRESS into a sequence of extract element, store /// temporarily, advance store position, before re-loading the final vector. - SDValue expandMASKED_COMPRESS(SDNode *Node, SelectionDAG &DAG) const; + SDValue expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const; /// Legalize a SETCC or VP_SETCC with given LHS and RHS and condition code CC /// on the current target. A VP_SETCC will additionally be given a Mask diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 014fe8051f1e11..59eea47f7c1910 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2362,7 +2362,7 @@ def int_masked_compressstore: [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, NoCapture>]>; -def int_masked_compress: +def int_experimental_vector_compress: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, LLVMMatchType<0>], [IntrNoMem, IntrWillReturn]>; diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index f41e23d935e45e..9d554dc7df1b41 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -752,7 +752,7 @@ HANDLE_TARGET_OPCODE(G_SHUFFLE_VECTOR) HANDLE_TARGET_OPCODE(G_SPLAT_VECTOR) /// Generic masked compress. -HANDLE_TARGET_OPCODE(G_MASKED_COMPRESS) +HANDLE_TARGET_OPCODE(G_VECTOR_COMPRESS) /// Generic count trailing zeroes. HANDLE_TARGET_OPCODE(G_CTTZ) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 7a7da7c69e6926..e84f31184e775c 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1501,7 +1501,7 @@ def G_SPLAT_VECTOR: GenericInstruction { } // Generic masked compress. -def G_MASKED_COMPRESS: GenericInstruction { +def G_VECTOR_COMPRESS: GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$vec, type1:$mask, type0:$passthru); let hasSideEffects = false; diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index e99fd42d0bb7ee..9e8c80daceb74f 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -187,7 +187,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; -def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 15ffc1d5f36f6e..94895beecb9f1d 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -745,7 +745,7 @@ def masked_gather : SDNode<"ISD::MGATHER", SDTMaskedGather, def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def masked_compress : SDNode<"ISD::MASKED_COMPRESS", SDTMaskedCompress>; +def vector_compress : SDNode<"ISD::VECTOR_COMPRESS", SDTMaskedCompress>; // Do not use ld, st directly. Use load, extload, sextload, zextload, store, // and truncst (see below). 
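After the rename, front ends and passes refer to the intrinsic as ``llvm.experimental.vector.compress``. A hedged sketch of emitting it from C++ (assuming the usual IRBuilder::CreateIntrinsic API; not code from this series):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// Vec and Passthru must have the same vector type; Mask is the matching
// <N x i1> vector. The intrinsic's single overloaded type is the vector type.
Value *emitVectorCompress(IRBuilder<> &B, Value *Vec, Value *Mask,
                          Value *Passthru) {
  return B.CreateIntrinsic(Intrinsic::experimental_vector_compress,
                           {Vec->getType()}, {Vec, Mask, Passthru});
}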
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index d26fef1e201f49..57f296137da61f 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1982,8 +1982,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_VECREDUCE_UMAX; case Intrinsic::vector_reduce_umin: return TargetOpcode::G_VECREDUCE_UMIN; - case Intrinsic::masked_compress: - return TargetOpcode::G_MASKED_COMPRESS; + case Intrinsic::experimental_vector_compress: + return TargetOpcode::G_VECTOR_COMPRESS; case Intrinsic::lround: return TargetOpcode::G_LROUND; case Intrinsic::llround: diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index f8949062ecf6ff..43c3bd676dd908 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -3953,8 +3953,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) { return lowerExtractInsertVectorElt(MI); case G_SHUFFLE_VECTOR: return lowerShuffleVector(MI); - case G_MASKED_COMPRESS: - return lowerMASKED_COMPRESS(MI); + case G_VECTOR_COMPRESS: + return lowerVECTOR_COMPRESS(MI); case G_DYN_STACKALLOC: return lowerDynStackAlloc(MI); case G_STACKSAVE: @@ -7513,7 +7513,7 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) { } LegalizerHelper::LegalizeResult -LegalizerHelper::lowerMASKED_COMPRESS(llvm::MachineInstr &MI) { +LegalizerHelper::lowerVECTOR_COMPRESS(llvm::MachineInstr &MI) { auto [Dst, DstTy, Vec, VecTy, Mask, MaskTy, Passthru, PassthruTy] = MI.getFirst4RegLLTs(); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 73e4e88c498669..3758a8717f3802 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -537,7 +537,7 @@ namespace { SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitSCALAR_TO_VECTOR(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); - SDValue visitMASKED_COMPRESS(SDNode *N); + SDValue visitVECTOR_COMPRESS(SDNode *N); SDValue visitMLOAD(SDNode *N); SDValue visitMSTORE(SDNode *N); SDValue visitMGATHER(SDNode *N); @@ -1958,7 +1958,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::MLOAD: return visitMLOAD(N); case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); - case ISD::MASKED_COMPRESS: return visitMASKED_COMPRESS(N); + case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); @@ -12080,7 +12080,7 @@ SDValue DAGCombiner::visitVP_STRIDED_STORE(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitMASKED_COMPRESS(SDNode *N) { +SDValue DAGCombiner::visitVECTOR_COMPRESS(SDNode *N) { SDLoc DL(N); SDValue Vec = N->getOperand(0); SDValue Mask = N->getOperand(1); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 6ad38cfa06c6fe..b51a675f934a59 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -87,8 +87,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { break; case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast(N)); break; - case ISD::MASKED_COMPRESS: - Res = PromoteIntRes_MASKED_COMPRESS(N); + 
case ISD::VECTOR_COMPRESS: + Res = PromoteIntRes_VECTOR_COMPRESS(N); break; case ISD::SELECT: case ISD::VSELECT: @@ -971,10 +971,10 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) { return Res; } -SDValue DAGTypeLegalizer::PromoteIntRes_MASKED_COMPRESS(SDNode *N) { +SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_COMPRESS(SDNode *N) { SDValue Vec = GetPromotedInteger(N->getOperand(0)); SDValue Passthru = GetPromotedInteger(N->getOperand(2)); - return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), Vec.getValueType(), Vec, + return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), Vec.getValueType(), Vec, N->getOperand(1), Passthru); } @@ -1926,8 +1926,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) { OpNo); break; case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast(N), OpNo); break; - case ISD::MASKED_COMPRESS: - Res = PromoteIntOp_MASKED_COMPRESS(N, OpNo); + case ISD::VECTOR_COMPRESS: + Res = PromoteIntOp_VECTOR_COMPRESS(N, OpNo); break; case ISD::VP_TRUNCATE: case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break; @@ -2423,14 +2423,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, N->getIndexType(), TruncateStore); } -SDValue DAGTypeLegalizer::PromoteIntOp_MASKED_COMPRESS(SDNode *N, +SDValue DAGTypeLegalizer::PromoteIntOp_VECTOR_COMPRESS(SDNode *N, unsigned OpNo) { - assert(OpNo == 1 && "Can only promote MASKED_COMPRESS mask."); + assert(OpNo == 1 && "Can only promote VECTOR_COMPRESS mask."); SDValue Vec = N->getOperand(0); EVT VT = Vec.getValueType(); SDValue Passthru = N->getOperand(2); SDValue Mask = PromoteTargetBoolean(N->getOperand(1), VT); - return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), VT, Vec, Mask, Passthru); + return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), VT, Vec, Mask, Passthru); } SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 887817bea72e54..5ed460d4e4a480 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -340,7 +340,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N); - SDValue PromoteIntRes_MASKED_COMPRESS(SDNode *N); + SDValue PromoteIntRes_VECTOR_COMPRESS(SDNode *N); SDValue PromoteIntRes_Overflow(SDNode *N); SDValue PromoteIntRes_FFREXP(SDNode *N); SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo); @@ -412,7 +412,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo); SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo); - SDValue PromoteIntOp_MASKED_COMPRESS(SDNode *N, unsigned OpNo); + SDValue PromoteIntOp_VECTOR_COMPRESS(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_FRAMERETURNADDR(SDNode *N); SDValue PromoteIntOp_FIX(SDNode *N); SDValue PromoteIntOp_ExpOp(SDNode *N); @@ -916,7 +916,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_MLOAD(MaskedLoadSDNode *MLD, SDValue &Lo, SDValue &Hi); void SplitVecRes_Gather(MemSDNode *VPGT, SDValue &Lo, SDValue &Hi, bool SplitSETCC = false); - void SplitVecRes_MASKED_COMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, 
SDValue &Hi); void SplitVecRes_STEP_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -1008,7 +1008,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_LOAD(SDNode* N); SDValue WidenVecRes_VP_LOAD(VPLoadSDNode *N); SDValue WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N); - SDValue WidenVecRes_MASKED_COMPRESS(SDNode *N); + SDValue WidenVecRes_VECTOR_COMPRESS(SDNode *N); SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N); SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N); SDValue WidenVecRes_VP_GATHER(VPGatherSDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index fa62ae1323d852..88f4d2c1f13559 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -449,7 +449,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: case ISD::MGATHER: - case ISD::MASKED_COMPRESS: + case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); @@ -1118,8 +1118,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { return; break; - case ISD::MASKED_COMPRESS: - Results.push_back(TLI.expandMASKED_COMPRESS(Node, DAG)); + case ISD::VECTOR_COMPRESS: + Results.push_back(TLI.expandVECTOR_COMPRESS(Node, DAG)); return; } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3f090380da7e6d..41c94ed7ebd8ed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1100,8 +1100,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_GATHER: SplitVecRes_Gather(cast(N), Lo, Hi, /*SplitSETCC*/ true); break; - case ISD::MASKED_COMPRESS: - SplitVecRes_MASKED_COMPRESS(N, Lo, Hi); + case ISD::VECTOR_COMPRESS: + SplitVecRes_VECTOR_COMPRESS(N, Lo, Hi); break; case ISD::SETCC: case ISD::VP_SETCC: @@ -2378,14 +2378,14 @@ void DAGTypeLegalizer::SplitVecRes_Gather(MemSDNode *N, SDValue &Lo, ReplaceValueWith(SDValue(N, 1), Ch); } -void DAGTypeLegalizer::SplitVecRes_MASKED_COMPRESS(SDNode *N, SDValue &Lo, +void DAGTypeLegalizer::SplitVecRes_VECTOR_COMPRESS(SDNode *N, SDValue &Lo, SDValue &Hi) { // This is not "trivial", as there is a dependency between the two subvectors. // Depending on the number of 1s in the mask, the elements from the Hi vector // need to be moved to the Lo vector. So we just perform this as one "big" // operation and then extract the Lo and Hi vectors from that. This gets rid - // of MASKED_COMPRESS and all other operands can be legalized later. - SDValue Compressed = TLI.expandMASKED_COMPRESS(N, DAG); + // of VECTOR_COMPRESS and all other operands can be legalized later. 
+ SDValue Compressed = TLI.expandVECTOR_COMPRESS(N, DAG); std::tie(Lo, Hi) = DAG.SplitVector(Compressed, SDLoc(N)); } @@ -4320,8 +4320,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::EXPERIMENTAL_VP_STRIDED_LOAD: Res = WidenVecRes_VP_STRIDED_LOAD(cast(N)); break; - case ISD::MASKED_COMPRESS: - Res = WidenVecRes_MASKED_COMPRESS(N); + case ISD::VECTOR_COMPRESS: + Res = WidenVecRes_VECTOR_COMPRESS(N); break; case ISD::MLOAD: Res = WidenVecRes_MLOAD(cast(N)); @@ -5737,7 +5737,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VP_STRIDED_LOAD(VPStridedLoadSDNode *N) { return Res; } -SDValue DAGTypeLegalizer::WidenVecRes_MASKED_COMPRESS(SDNode *N) { +SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_COMPRESS(SDNode *N) { SDValue Vec = N->getOperand(0); SDValue Mask = N->getOperand(1); SDValue Passthru = N->getOperand(2); @@ -5750,7 +5750,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MASKED_COMPRESS(SDNode *N) { SDValue WideVec = ModifyToType(Vec, WideVecVT); SDValue WideMask = ModifyToType(Mask, WideMaskVT, /*FillWithZeroes=*/true); SDValue WidePassthru = ModifyToType(Passthru, WideVecVT); - return DAG.getNode(ISD::MASKED_COMPRESS, SDLoc(N), WideVecVT, WideVec, + return DAG.getNode(ISD::VECTOR_COMPRESS, SDLoc(N), WideVecVT, WideVec, WideMask, WidePassthru); } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d8a2c614e5ead7..5ccc85592ffea8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7507,7 +7507,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (N1.getValueType() == VT) return N1; break; - case ISD::MASKED_COMPRESS: { + case ISD::VECTOR_COMPRESS: { EVT VecVT = N1.getValueType(); [[maybe_unused]] EVT MaskVT = N2.getValueType(); [[maybe_unused]] EVT PassthruVT = N3.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index b9e73e351be767..9f589afd7cfbfb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6731,13 +6731,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::masked_compressstore: visitMaskedStore(I, true /* IsCompressing */); return; - case Intrinsic::masked_compress: - setValue(&I, DAG.getNode(ISD::MASKED_COMPRESS, sdl, - getValue(I.getArgOperand(0)).getValueType(), - getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)), - getValue(I.getArgOperand(2)), Flags)); - return; case Intrinsic::powi: setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)), getValue(I.getArgOperand(1)), DAG)); @@ -8051,6 +8044,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vector_deinterleave2: visitVectorDeinterleave(I); return; + case Intrinsic::experimental_vector_compress: + setValue(&I, DAG.getNode(ISD::VECTOR_COMPRESS, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), + getValue(I.getArgOperand(2)), Flags)); + return; case Intrinsic::experimental_convergence_anchor: case Intrinsic::experimental_convergence_entry: case Intrinsic::experimental_convergence_loop: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 3a10d4fc2a6891..5f3765eda8910e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ 
b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -422,7 +422,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::MSTORE: return "masked_store"; case ISD::MGATHER: return "masked_gather"; case ISD::MSCATTER: return "masked_scatter"; - case ISD::MASKED_COMPRESS: return "masked_compress"; + case ISD::VECTOR_COMPRESS: return "vector_compress"; case ISD::VAARG: return "vaarg"; case ISD::VACOPY: return "vacopy"; case ISD::VAEND: return "vaend"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index af48c03baf6ca7..4c76a48818d8ed 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11336,7 +11336,7 @@ SDValue TargetLowering::expandVectorSplice(SDNode *Node, MachinePointerInfo::getUnknownStack(MF)); } -SDValue TargetLowering::expandMASKED_COMPRESS(SDNode *Node, +SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, SelectionDAG &DAG) const { SDLoc DL(Node); SDValue Vec = Node->getOperand(0); diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index dc0b88c270bbff..a1408e0d903254 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -989,7 +989,7 @@ void TargetLoweringBase::initActions() { setOperationAction(ISD::VECTOR_SPLICE, VT, Expand); // Only some target support this vector operation. Most need to expand it. - setOperationAction(ISD::MASKED_COMPRESS, VT, Expand); + setOperationAction(ISD::VECTOR_COMPRESS, VT, Expand); // VP operations default to expand. #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) \ diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 695edde2e2da7c..47c3dc292e2e0b 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1151,7 +1151,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .lower(); // TODO: Update this to correct handling when adding AArch64/SVE support. 
- getActionDefinitionsBuilder(G_MASKED_COMPRESS).lower(); + getActionDefinitionsBuilder(G_VECTOR_COMPRESS).lower(); getActionDefinitionsBuilder({G_FSHL, G_FSHR}) .customFor({{s32, s32}, {s32, s64}, {s64, s64}}) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir similarity index 96% rename from llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir rename to llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir index 41b83d8d3214fe..cc7577473b5485 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-masked-compress.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-vector-compress.mir @@ -1,12 +1,12 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s --- -name: test_masked_compress_v4s32 +name: test_vector_compress_v4s32 body: | bb.0: liveins: $q0, $d1 - ; CHECK-LABEL: name: test_masked_compress_v4s32 + ; CHECK-LABEL: name: test_vector_compress_v4s32 ; CHECK: liveins: $q0, $d1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 @@ -60,18 +60,18 @@ body: | %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s16>) = COPY $d1 %2:_(<4 x s32>) = G_IMPLICIT_DEF - %3:_(<4 x s32>) = G_MASKED_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>) + %3:_(<4 x s32>) = G_VECTOR_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>) $q0 = COPY %3(<4 x s32>) RET_ReallyLR implicit $q0 ... --- -name: test_masked_compress_v4s32_with_passthru +name: test_vector_compress_v4s32_with_passthru body: | bb.0: liveins: $q0, $d1, $q2 - ; CHECK-LABEL: name: test_masked_compress_v4s32_with_passthru + ; CHECK-LABEL: name: test_vector_compress_v4s32_with_passthru ; CHECK: liveins: $q0, $d1, $q2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 @@ -147,7 +147,7 @@ body: | %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s16>) = COPY $d1 %2:_(<4 x s32>) = COPY $q2 - %3:_(<4 x s32>) = G_MASKED_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>) + %3:_(<4 x s32>) = G_VECTOR_COMPRESS %0(<4 x s32>), %1(<4 x s16>), %2(<4 x s32>) $q0 = COPY %3(<4 x s32>) RET_ReallyLR implicit $q0 ... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index b9dd8cac4cc08f..c76b01f60919c1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -642,7 +642,7 @@ # DEBUG-NEXT: G_SPLAT_VECTOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: G_MASKED_COMPRESS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices +# DEBUG-NEXT: G_VECTOR_COMPRESS (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_CTTZ (opcode {{[0-9]+}}): 2 type indices, 0 imm indices diff --git a/llvm/test/CodeGen/AArch64/masked-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll similarity index 88% rename from llvm/test/CodeGen/AArch64/masked-compress.ll rename to llvm/test/CodeGen/AArch64/vector-compress.ll index 436e5af1e1e44d..fcf5c546f2610a 100644 --- a/llvm/test/CodeGen/AArch64/masked-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -28,7 +28,7 @@ define <4 x i32> @test_compress_v4i32(<4 x i32> %vec, <4 x i1> %mask) { ; CHECK-NEXT: st1.s { v0 }[3], [x11] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> undef) ret <4 x i32> %out } @@ -78,7 +78,7 @@ define <4 x i32> @test_compress_v4i32_with_passthru(<4 x i32> %vec, <4 x i1> %ma ; CHECK-NEXT: str w9, [x8] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> %mask, <4 x i32> %passthru) ret <4 x i32> %out } @@ -97,7 +97,7 @@ define <2 x double> @test_compress_v2f64(<2 x double> %vec, <2 x i1> %mask) { ; CHECK-NEXT: st1.d { v0 }[1], [x8] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <2 x double> @llvm.masked.compress.v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> undef) + %out = call <2 x double> @llvm.experimental.vector.compress.v2f64(<2 x double> %vec, <2 x i1> %mask, <2 x double> undef) ret <2 x double> %out } @@ -197,7 +197,7 @@ define <16 x i8> @test_compress_v16i8(<16 x i8> %vec, <16 x i1> %mask) { ; CHECK-NEXT: st1.b { v0 }[15], [x8] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <16 x i8> @llvm.masked.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef) + %out = call <16 x i8> @llvm.experimental.vector.compress(<16 x i8> %vec, <16 x i1> %mask, <16 x i8> undef) ret <16 x i8> %out } @@ -249,7 +249,7 @@ define <8 x i32> @test_compress_large(<8 x i32> %vec, <8 x i1> %mask) { ; CHECK-NEXT: st1.s { v1 }[3], [x8] ; CHECK-NEXT: ldp q0, q1, [sp], #32 ; CHECK-NEXT: ret - %out = call <8 x i32> @llvm.masked.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> undef) + %out = call <8 x i32> @llvm.experimental.vector.compress(<8 x i32> %vec, <8 x i1> %mask, <8 x i32> undef) ret <8 x i32> %out } @@ -262,7 +262,7 @@ define <4 x i32> @test_compress_all_const() { ; CHECK-NEXT: ldr q0, [x8, lCPI5_0@PAGEOFF] ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> , + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> , <4 x i1> , <4 x i32> undef) ret <4 x i32> %out @@ -273,7 +273,7 @@ define <4 x i32> @test_compress_const_mask(<4 x i32> %vec) { ; CHECK: ; %bb.0: ; CHECK-NEXT: mov.s v0[1], v0[3] ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> undef) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> undef) ret <4 x i32> %out } @@ -284,7 +284,7 @@ define <4 x i32> @test_compress_const_mask_passthrough(<4 x i32> %vec, <4 x i32> ; CHECK-NEXT: mov.s v1[0], v0[0] ; CHECK-NEXT: mov.16b v0, v1 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> 
%passthru) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> %passthru) ret <4 x i32> %out } @@ -297,7 +297,7 @@ define <4 x i32> @test_compress_const_mask_const_passthrough(<4 x i32> %vec) { ; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: mov.s v0[3], w8 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> ) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> , <4 x i32> ) ret <4 x i32> %out } @@ -308,21 +308,21 @@ define <4 x i32> @test_compress_const_splat1_mask(<4 x i32> %ignore, <4 x i32> % ; CHECK: ; %bb.0: ; CHECK-NEXT: mov.16b v0, v1 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 -1), <4 x i32> undef) ret <4 x i32> %out } define <4 x i32> @test_compress_const_splat0_mask(<4 x i32> %ignore, <4 x i32> %vec) { ; CHECK-LABEL: test_compress_const_splat0_mask: ; CHECK: ; %bb.0: ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) ret <4 x i32> %out } define <4 x i32> @test_compress_undef_mask(<4 x i32> %ignore, <4 x i32> %vec) { ; CHECK-LABEL: test_compress_undef_mask: ; CHECK: ; %bb.0: ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> undef, <4 x i32> undef) ret <4 x i32> %out } define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignore, <4 x i32> %vec, <4 x i32> %passthru) { @@ -330,14 +330,14 @@ define <4 x i32> @test_compress_const_splat0_mask_with_passthru(<4 x i32> %ignor ; CHECK: ; %bb.0: ; CHECK-NEXT: mov.16b v0, v2 ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> %passthru) ret <4 x i32> %out } define <4 x i32> @test_compress_const_splat0_mask_without_passthru(<4 x i32> %ignore, <4 x i32> %vec) { ; CHECK-LABEL: test_compress_const_splat0_mask_without_passthru: ; CHECK: ; %bb.0: ; CHECK-NEXT: ret - %out = call <4 x i32> @llvm.masked.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %vec, <4 x i1> splat (i1 0), <4 x i32> undef) ret <4 x i32> %out } @@ -369,7 +369,7 @@ define <4 x i8> @test_compress_small(<4 x i8> %vec, <4 x i1> %mask) { ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %out = call <4 x i8> @llvm.masked.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef) + %out = call <4 x i8> @llvm.experimental.vector.compress(<4 x i8> %vec, <4 x i1> %mask, <4 x i8> undef) ret <4 x i8> %out } @@ -401,7 +401,7 @@ define <4 x i4> @test_compress_illegal_element_type(<4 x i4> %vec, <4 x i1> %mas ; CHECK-NEXT: ldr d0, [sp, #8] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %out = call <4 x i4> @llvm.masked.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef) + %out = call <4 x i4> @llvm.experimental.vector.compress(<4 x i4> %vec, <4 x i1> %mask, <4 x i4> undef) ret <4 x i4> %out } @@ -436,7 +436,7 @@ define <3 x i32> @test_compress_narrow(<3 x i32> 
%vec, <3 x i1> %mask) { ; CHECK-NEXT: st1.s { v0 }[3], [x10] ; CHECK-NEXT: ldr q0, [sp], #16 ; CHECK-NEXT: ret - %out = call <3 x i32> @llvm.masked.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef) + %out = call <3 x i32> @llvm.experimental.vector.compress(<3 x i32> %vec, <3 x i1> %mask, <3 x i32> undef) ret <3 x i32> %out } @@ -469,6 +469,6 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i ; CHECK-NEXT: umov.h w2, v0[2] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret - %out = call <3 x i3> @llvm.masked.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef) + %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef) ret <3 x i3> %out } From ba2939ea1299309393df8334c1c7f0be5d6da5bf Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 1 Jul 2024 11:24:53 +0200 Subject: [PATCH 46/49] Add freeze to mask extract for poison/undef entries --- llvm/docs/LangRef.rst | 10 +++++----- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 2129b2c7f58271..f07f6a1b8e373f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19263,11 +19263,6 @@ All selected elements are written into adjacent lanes in the result vector, from lower to higher. The mask holds an entry for each vector lane, and is used to select elements to be kept. -If a mask entry if undefined or poison, it is treated as "false", i.e., the -element is not selected. -If ``passthru`` is undefined, the number of valid lanes is equal to the number -of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values -are undefined. If a ``passthru`` vector is given, all remaining lanes are filled with the corresponding lane's value from ``passthru``. The main difference to :ref:`llvm.masked.compressstore ` is @@ -19302,6 +19297,11 @@ This intrinsic performs the logic of the following C++ example. All values in ``out`` after the last selected one are undefined if ``passthru`` is undefined. If all entries in the ``mask`` are 0, the ``out`` vector is ``passthru``. +If any element of the mask is poison, all elements of the result are poison. +Otherwise, if any element of the mask is undef, all elements of the result are undef. +If ``passthru`` is undefined, the number of valid lanes is equal to the number +of ``true`` entries in the mask, i.e., all lanes >= number-of-selected-values +are undefined. .. code-block:: cpp diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4c76a48818d8ed..14879cc51f8447 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -11409,8 +11409,10 @@ SDValue TargetLowering::expandVECTOR_COMPRESS(SDNode *Node, // Get the mask value and add it to the current output position. This // either increments by 1 if MaskI is true or adds 0 otherwise. - SDValue MaskI = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx); + // Freeze in case we have poison/undef mask entries. 
+ SDValue MaskI = DAG.getFreeze( + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskScalarVT, Mask, Idx)); + MaskI = DAG.getFreeze(MaskI); MaskI = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, MaskI); MaskI = DAG.getNode(ISD::ZERO_EXTEND, DL, PositionVT, MaskI); OutPos = DAG.getNode(ISD::ADD, DL, PositionVT, OutPos, MaskI); From 11e1742be470822032d06ecc2267936268b2c527 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Mon, 1 Jul 2024 11:30:20 +0200 Subject: [PATCH 47/49] Fix docs --- llvm/docs/LangRef.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index f07f6a1b8e373f..016dccb9d96291 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19238,7 +19238,7 @@ The ``mask`` operand will apply to at least the gather and scatter operations. .. _int_vector_compress: '``llvm.experimental.vector.compress.*``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ LLVM provides an intrinsic for compressing data within a vector based on a selection mask. Semantically, this is similar to :ref:`llvm.masked.compressstore ` but with weaker assumptions From 32cc27f8f5468bff60b7b4e450e17c994635fa06 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Tue, 2 Jul 2024 15:20:11 +0200 Subject: [PATCH 48/49] Fix docs --- llvm/docs/LangRef.rst | 4 ++-- llvm/docs/ReleaseNotes.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 016dccb9d96291..4530b5aa1e1460 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -19290,8 +19290,8 @@ Semantics: """""""""" The ``llvm.experimental.vector.compress`` intrinsic compresses data within a vector. -It collects elements from possibly non-adjacent lanes of a vector and place -them contiguously in the result vector based on a selection mask and fill the +It collects elements from possibly non-adjacent lanes of a vector and places +them contiguously in the result vector based on a selection mask, filling the remaining lanes with values from ``passthru``. This intrinsic performs the logic of the following C++ example. All values in ``out`` after the last selected one are undefined if diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index ecac5e34670d71..f8cce2cab91101 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -78,7 +78,7 @@ Changes to the LLVM IR * ``llvm.instprof.mcdc.tvbitmap.update``: 3rd argument has been removed. The next argument has been changed from byte index to bit index. -* Added ``llvm.masked.compress`` intrinsic. +* Added ``llvm.experimental.vector.compress`` intrinsic. 
Changes to LLVM infrastructure ------------------------------ From 99610a8c58125575553ed57cd27bc6586c416b09 Mon Sep 17 00:00:00 2001 From: Lawrence Benson Date: Wed, 3 Jul 2024 16:53:20 +0200 Subject: [PATCH 49/49] Address PR comments --- llvm/include/llvm/Target/TargetSelectionDAG.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 94895beecb9f1d..50667b9eb8efcb 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -266,7 +266,7 @@ def SDTMaskedScatter : SDTypeProfile<0, 4, [ SDTCisSameNumEltsAs<0, 1>, SDTCisSameNumEltsAs<0, 3> ]>; -def SDTMaskedCompress : SDTypeProfile<1, 3, [ +def SDTVectorCompress : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVec<2>, SDTCisSameNumEltsAs<1, 2>, SDTCisSameAs<1, 3> @@ -745,7 +745,7 @@ def masked_gather : SDNode<"ISD::MGATHER", SDTMaskedGather, def masked_scatter : SDNode<"ISD::MSCATTER", SDTMaskedScatter, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; -def vector_compress : SDNode<"ISD::VECTOR_COMPRESS", SDTMaskedCompress>; +def vector_compress : SDNode<"ISD::VECTOR_COMPRESS", SDTVectorCompress>; // Do not use ld, st directly. Use load, extload, sextload, zextload, store, // and truncst (see below).
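(Illustration, not part of the patch.) The generic lowering added in this series, TargetLowering::expandVECTOR_COMPRESS, is described in TargetLowering.h as a sequence of extract element, store temporarily, advance store position, and re-load of the final vector; a later patch in the series additionally freezes the extracted mask bit so poison/undef entries cannot leak into the store position. A rough scalar model of that strategy is sketched below. It is an approximation rather than the exact SelectionDAG node sequence (in particular, the real expansion handles ``passthru`` differently), and the name ``expandCompressSketch`` is invented for this example.

.. code-block:: cpp

    // Scalar approximation of the expandVECTOR_COMPRESS strategy (sketch only):
    // unconditional store at the current position, conditional position advance.
    #include <cstddef>

    template <typename T, std::size_t N>
    void expandCompressSketch(const T (&Vec)[N], const bool (&Mask)[N],
                              const T (&Passthru)[N], T (&Out)[N]) {
      T Tmp[N]; // stands in for the stack temporary used by the expansion
      std::size_t OutPos = 0;
      for (std::size_t I = 0; I < N; ++I) {
        Tmp[OutPos] = Vec[I];      // store every element at the current position
        OutPos += Mask[I] ? 1 : 0; // advance only for selected lanes; the DAG code
                                   // freezes the extracted mask bit before this add
      }
      for (std::size_t I = OutPos; I < N; ++I)
        Tmp[I] = Passthru[I];      // remaining lanes take the passthru values
      for (std::size_t I = 0; I < N; ++I)
        Out[I] = Tmp[I];           // corresponds to re-loading the final vector
    }

Expressing the per-lane update as "truncate the mask element, zero-extend it, and add it to the position", as the TargetLowering.cpp hunk does, means the expansion needs no per-lane control flow: unselected lanes simply get overwritten by the next selected element or by the passthru values.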