diff --git a/compilerutils/include/compilerutils/CompilerUtils.h b/compilerutils/include/compilerutils/CompilerUtils.h index 3669ebeb2b..506dcad925 100644 --- a/compilerutils/include/compilerutils/CompilerUtils.h +++ b/compilerutils/include/compilerutils/CompilerUtils.h @@ -70,6 +70,9 @@ llvm::Function *cloneFunctionHeader(llvm::Function &f, llvm::FunctionType *newTy // Add an unreachable at the current position and remove the rest of the basic block. void createUnreachable(llvm::IRBuilder<> &b); +// Specifies a memory that is loaded is the last use. +void setIsLastUseLoad(llvm::LoadInst &Load); + struct CrossModuleInlinerResult { llvm::Value *returnValue; llvm::iterator_range newBBs; diff --git a/compilerutils/lib/CompilerUtils.cpp b/compilerutils/lib/CompilerUtils.cpp index 40cc96bd83..8bf7ca653d 100644 --- a/compilerutils/lib/CompilerUtils.cpp +++ b/compilerutils/lib/CompilerUtils.cpp @@ -41,6 +41,10 @@ using namespace llvm; +// Whether this is a load instruction that should translate to a last_use +// load. +static constexpr const char *MDIsLastUseName = "amdgpu.last.use"; + // ===================================================================================================================== // Create an LLVM function call to the named function. The callee is built // automatically based on return type and its parameters. @@ -150,6 +154,10 @@ void CompilerUtils::createUnreachable(llvm::IRBuilder<> &b) { DeleteDeadBlock(oldCode); } +void CompilerUtils::setIsLastUseLoad(llvm::LoadInst &Load) { + Load.setMetadata(MDIsLastUseName, MDTuple::get(Load.getContext(), {})); +} + namespace { // Get the name of a global that is copied to a different module for inlining. 
diff --git a/gfxruntime/src/shaders/AdvancedBlend.hlsl b/gfxruntime/src/shaders/AdvancedBlend.hlsl index 872f01b360..5bff6c24cc 100644 --- a/gfxruntime/src/shaders/AdvancedBlend.hlsl +++ b/gfxruntime/src/shaders/AdvancedBlend.hlsl @@ -49,8 +49,8 @@ float4 AmdExtFragCoord() DUMMY_FLOAT4_FUNC int AmdExtSampleId() DUMMY_INT_FUNC -float4 AmdAdvancedBlendTexelLoad(int4 imageLow, int4 imageHigh, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC -float4 AmdAdvancedBlendTexelLoadFmask(int4 imageMsLow, int4 imageMsHigh, int4 fmaskLow, int4 fmaskHigh, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC +float4 AmdAdvancedBlendTexelLoad(int64_t imageDesc, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC +float4 AmdAdvancedBlendTexelLoadFmask(int64_t imageDesc, int64_t fmaskDesc, int2 iCoord, int lod) DUMMY_FLOAT4_FUNC float4 AmdAdvancedBlendCoherentTexelLoad(float4 color, int2 iCoord, int sampleId) DUMMY_FLOAT4_FUNC void AmdAdvancedBlendCoherentTexelStore(float4 color, int2 iCoord, int sampleId) DUMMY_VOID_FUNC @@ -224,8 +224,8 @@ float AmdAdvancedBlendDivide(float dividend, float divisor) { } } -export float4 AmdAdvancedBlendInternal(float4 inColor, int4 imageMsLow, int4 imageMsHigh, int4 imageLow, int4 imageHigh, - int4 fmaskLow, int4 fmaskHigh, int mode, bool isMsaa) { +export float4 AmdAdvancedBlendInternal(float4 inColor, int64_t imageDescMs, int64_t imageDesc, int64_t fmaskDesc, + int mode, bool isMsaa) { float4 srcColor = inColor; if (mode == 0) { return srcColor; @@ -234,9 +234,9 @@ export float4 AmdAdvancedBlendInternal(float4 inColor, int4 imageMsLow, int4 ima int2 iCoord = int2(fragCoord.x, fragCoord.y); float4 dstColor; if (isMsaa) { - dstColor = AmdAdvancedBlendTexelLoadFmask(imageMsLow, imageMsHigh, fmaskLow, fmaskHigh, iCoord, 0); + dstColor = AmdAdvancedBlendTexelLoadFmask(imageDescMs, fmaskDesc, iCoord, 0); } else { - dstColor = AmdAdvancedBlendTexelLoad(imageLow, imageHigh, iCoord, 0); + dstColor = AmdAdvancedBlendTexelLoad(imageDesc, iCoord, 0); } // TODO: Uncomment them once ROV is 
support in LLPC // int sampleId = AmdExtSampleId(); diff --git a/imported/llvm-dialects b/imported/llvm-dialects index 55e176fb88..ed4b46e842 160000 --- a/imported/llvm-dialects +++ b/imported/llvm-dialects @@ -1 +1 @@ -Subproject commit 55e176fb88bcfc4fae45bafaa3ff209ec4c0d4ee +Subproject commit ed4b46e8425066a96a5e79afc29bce3d82eecf71 diff --git a/include/vkgcDefs.h b/include/vkgcDefs.h index 60b5621565..aed46de1b2 100644 --- a/include/vkgcDefs.h +++ b/include/vkgcDefs.h @@ -472,6 +472,7 @@ struct PipelineOptions { bool internalRtShaders; ///< Whether this pipeline has internal raytracing shaders unsigned forceNonUniformResourceIndexStageMask; ///< Mask of the stage to force using non-uniform resource index. bool reserved16; +#if LLPC_CLIENT_INTERFACE_MAJOR_VERSION < 73 bool replaceSetWithResourceType; ///< For OGL only, replace 'set' with resource type during spirv translate bool disableSampleMask; ///< For OGL only, disabled if framebuffer doesn't attach multisample texture bool buildResourcesDataForShaderModule; ///< For OGL only, build resources usage data while building shader module @@ -482,6 +483,25 @@ struct PipelineOptions { bool enableFragColor; ///< For OGL only, need to do frag color broadcast if it is enabled. 
bool disableBaseVertex; ///< For OGL only, force the BaseVertex builtin to 0 instead of /// loading it from userdata + bool bindlessTextureMode; ///< For OGL only, true if bindless textures are used + bool bindlessImageMode; ///< For OGL only, true if bindless images are used + const auto &getGlState() const { return *this; } +#else + struct GLState { + bool replaceSetWithResourceType; ///< For OGL only, replace 'set' with resource type during spirv translate + bool disableSampleMask; ///< For OGL only, disabled if framebuffer doesn't attach multisample texture + bool buildResourcesDataForShaderModule; ///< For OGL only, build resources usage data while building shader module + bool disableTruncCoordForGather; ///< If set, trunc_coord of sampler srd is disabled for gather4 + bool enableCombinedTexture; ///< For OGL only, use the 'set' for DescriptorCombinedTexture + ///< for sampled images and samplers + bool vertex64BitsAttribSingleLoc; ///< For OGL only, dvec3/dvec4 vertex attrib only consumes 1 location. + bool enableFragColor; ///< For OGL only, need to do frag color broadcast if it is enabled. + bool disableBaseVertex; ///< For OGL only, force the BaseVertex builtin to 0 instead of + bool bindlessTextureMode; ///< For OGL only, true if bindless textures are used + bool bindlessImageMode; ///< For OGL only, true if bindless images are used + } glState; + const auto &getGlState() const { return glState; } +#endif unsigned reserved20; bool enablePrimGeneratedQuery; ///< If set, primitive generated query is enabled bool disablePerCompFetch; ///< Disable per component fetch in uber fetch shader. @@ -512,6 +532,7 @@ struct ResourceNodeData { unsigned isTexelFetchUsed; ///< TRUE if texelFetch is used unsigned isDefaultUniformSampler; ///< TRUE if it's sampler image in default uniform struct unsigned columnCount; ///< Column count if this is a matrix variable. + unsigned componentCount; ///< Component count if this is a vector, row count if it is a matrix. 
BasicType basicType; ///< Type of the variable or element }; @@ -545,6 +566,43 @@ struct ResourcesNodes { unsigned defaultUniformInfoCount; }; +// raytracing system value usage flags +union RayTracingSystemValueUsage { + struct { + union { + struct { + uint16_t flags : 1; // Shader calls gl_IncomingRayFlagsEXT + uint16_t worldRayOrigin : 1; // Shader calls gl_WorldRayOriginEXT + uint16_t tMin : 1; // Shader calls gl_RayTminEXT + uint16_t worldRayDirection : 1; // Shader calls gl_WorldRayDirectionEXT + uint16_t tCurrent : 1; // Shader calls gl_HitTEXT + uint16_t launchId : 1; // Shader calls gl_LaunchIDEXT + uint16_t launchSize : 1; // Shader calls gl_LaunchSizeEXT + uint16_t reserved : 9; // Reserved + }; + uint16_t u16All; + } ray; + + union { + struct { + uint16_t hitKind : 1; // Shader calls gl_HitKindEXT + uint16_t instanceIndex : 1; // Shader calls gl_InstanceCustomIndexEXT + uint16_t instanceID : 1; // Shader calls gl_InstanceID + uint16_t primitiveIndex : 1; // Shader calls gl_PrimitiveID + uint16_t geometryIndex : 1; // Shader calls gl_GeometryIndexEXT + uint16_t objectToWorld : 1; // Shader calls gl_ObjectToWorldEXT + uint16_t objectRayOrigin : 1; // Shader calls gl_ObjectRayOriginEXT + uint16_t objectRayDirection : 1; // Shader calls gl_ObjectRayDirectionEXT + uint16_t worldToObject : 1; // Shader calls gl_WorldToObjectEXT + uint16_t hitTrianglePosition : 1; // Shader calls gl_HitTriangleVertexPositionsEXT + uint16_t reserved : 6; // Reserved + }; + uint16_t u16All; + } primitive; + }; + uint32_t u32All; +}; + /// Represents usage info of a shader module struct ShaderModuleUsage { bool enableVarPtrStorageBuf; ///< Whether to enable "VariablePointerStorageBuffer" capability @@ -573,12 +631,14 @@ struct ShaderModuleUsage { bool pixelCenterInteger; ///< Whether pixel coord is Integer bool useGenericBuiltIn; ///< Whether to use builtIn inputs that include gl_PointCoord, gl_PrimitiveId, /// gl_Layer, gl_ClipDistance or gl_CullDistance. 
+ bool useBarycentric; ///< Whether to use gl_BarycentricXX or pervertexEXT decoration bool enableXfb; ///< Whether transform feedback is enabled unsigned localSizeX; ///< Compute shader work-group size in the X dimension unsigned localSizeY; ///< Compute shader work-group size in the Y dimension unsigned localSizeZ; ///< Compute shader work-group size in the Z dimension bool disableDualSource; ///< Whether disable dualSource blend uint32_t clipDistanceArraySize; ///< Count of output clip distance + RayTracingSystemValueUsage rtSystemValueUsage; ///< Usage flags for ray tracing builtins }; /// Represents common part of shader module data @@ -1001,43 +1061,6 @@ enum RayTracingRayFlag : unsigned { }; // ===================================================================================================================== -// raytracing system value usage flags -union RayTracingSystemValueUsage { - struct { - union { - struct { - uint16_t flags : 1; // Shader calls gl_IncomingRayFlagsEXT - uint16_t worldRayOrigin : 1; // Shader calls gl_WorldRayOriginEXT - uint16_t tMin : 1; // Shader calls gl_RayTminEXT - uint16_t worldRayDirection : 1; // Shader calls gl_WorldRayDirectionEXT - uint16_t tCurrent : 1; // Shader calls gl_HitTEXT - uint16_t launchId : 1; // Shader calls gl_LaunchIDEXT - uint16_t launchSize : 1; // Shader calls gl_LaunchSizeEXT - uint16_t reserved : 9; // Reserved - }; - uint16_t u16All; - } ray; - - union { - struct { - uint16_t hitKind : 1; // Shader calls gl_HitKindEXT - uint16_t instanceIndex : 1; // Shader calls gl_InstanceCustomIndexEXT - uint16_t instanceID : 1; // Shader calls gl_InstanceID - uint16_t primitiveIndex : 1; // Shader calls gl_PrimitiveID - uint16_t geometryIndex : 1; // Shader calls gl_GeometryIndexEXT - uint16_t objectToWorld : 1; // Shader calls gl_ObjectToWorldEXT - uint16_t objectRayOrigin : 1; // Shader calls gl_ObjectRayOriginEXT - uint16_t objectRayDirection : 1; // Shader calls gl_ObjectRayDirectionEXT - uint16_t worldToObject 
: 1; // Shader calls gl_WorldToObjectEXT - uint16_t hitTrianglePosition : 1; // Shader calls gl_HitTriangleVertexPositionsEXT - uint16_t reserved : 6; // Reserved - }; - uint16_t u16All; - } primitive; - }; - uint32_t u32All; -}; - /// Represents ray-tracing shader export configuration struct RayTracingShaderExportConfig { unsigned indirectCallingConvention; ///< Indirect calling convention @@ -1299,6 +1322,7 @@ struct GraphicsPipelineBuildInfo { float pixelTransferBias[4]; ///< Bias apply to render color target bool enableColorClampVs; ///< Enable clamp vertex output color bool enableColorClampFs; ///< Enable clamp fragment output color + bool enableFlatShade; ///< Whether enable flat shade. } glState; const auto &getGlState() const { return glState; } #endif @@ -1597,6 +1621,7 @@ class IUtil { /// /// @param [in] spvBin SPIR-V binary static const char *VKAPI_CALL GetEntryPointNameFromSpirvBinary(const BinaryData *spvBin); + static const char *VKAPI_CALL GetResourceMappingNodeTypeName(ResourceMappingNodeType type); }; /// 128-bit hash compatible structure diff --git a/lgc/CMakeLists.txt b/lgc/CMakeLists.txt index e7feab6d76..e49c3a34fc 100644 --- a/lgc/CMakeLists.txt +++ b/lgc/CMakeLists.txt @@ -191,6 +191,7 @@ target_sources(LLVMlgc PRIVATE state/ShaderModes.cpp state/ShaderStage.cpp state/TargetInfo.cpp + state/RuntimeContext.cpp ) # lgc/util diff --git a/lgc/builder/ArithBuilder.cpp b/lgc/builder/ArithBuilder.cpp index 57ba3e3f8c..fee6b1a564 100644 --- a/lgc/builder/ArithBuilder.cpp +++ b/lgc/builder/ArithBuilder.cpp @@ -507,29 +507,20 @@ Value *BuilderImpl::CreateCosh(Value *x, const Twine &instName) { // @param x : Input value X // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateTanh(Value *x, const Twine &instName) { - // sinh(x) / cosh(x) - // (e^x - e^(-x))/(e^x + e^(-x)) + // tanh(x) = copysign(1-2/(e^-|2x|+1),x) // 1/log(2) = 1.442695 - // e^x = 2^(x*(1/log(2))) = 2^(x*1.442695)) - Value *divLog2 = CreateFMul(x, 
getRecipLog2(x->getType())); - Value *negDivLog2 = CreateFSub(ConstantFP::get(x->getType(), 0.0), divLog2); - Value *exp = CreateUnaryIntrinsic(Intrinsic::exp2, divLog2); - Value *expNeg = CreateUnaryIntrinsic(Intrinsic::exp2, negDivLog2); - Value *doubleSinh = CreateFSub(exp, expNeg); - Value *doubleCosh = CreateFAdd(exp, expNeg); - Value *result = fDivFast(doubleSinh, doubleCosh); - - if (!getFastMathFlags().noInfs()) { - // NOTE: If the fast math flags might have INFs, we should check the special case when the input is +INF or -INF. - // According to the limit of tanh(x), we have following definitions: - // / 1.0, when x -> +INF - // lim(tanh(x)) = - // \ -1.0, when x -> -INF - Value *one = ConstantFP::get(x->getType(), 1.0); - Value *isInf = CreateIsInf(x); - result = CreateSelect(isInf, CreateCopySign(one, x), result); - } - + // e = 2^(1/log(2)) + // e^-|2x| = 2^(-|2x|*(1/log(2))) + auto vTy = x->getType(); + Value *result = CreateIntrinsic(Intrinsic::fabs, vTy, x); + result = CreateFNeg(result); + result = CreateFMul(ConstantFP::get(vTy, 2.0), result); + result = CreateFMul(getRecipLog2(vTy), result); + result = CreateUnaryIntrinsic(Intrinsic::exp2, result); + result = CreateFAdd(ConstantFP::get(vTy, 1.0), result); + result = fDivFast(ConstantFP::get(vTy, 2.0), result); + result = CreateFSub(ConstantFP::get(vTy, 1.0), result); + result = CreateCopySign(result, x); result->setName(instName); return result; } diff --git a/lgc/builder/BuilderImpl.cpp b/lgc/builder/BuilderImpl.cpp index ad69faf624..0177fcd2be 100644 --- a/lgc/builder/BuilderImpl.cpp +++ b/lgc/builder/BuilderImpl.cpp @@ -249,11 +249,12 @@ Value *BuilderImpl::CreateIntegerDotProduct(Value *vector1, Value *vector2, Valu // ===================================================================================================================== // Get whether the context we are building in supports ds_bpermute or v_bpermute across all lanes in the wave -bool BuilderImpl::supportWaveWideBPermute() const { 
+// +// @param shaderStage : shader stage enum. +bool BuilderImpl::supportWaveWideBPermute(ShaderStageEnum shaderStage) const { auto gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion().major; auto supportBPermute = gfxIp == 8 || gfxIp == 9; - auto shaderStage = getShaderStage(GetInsertBlock()->getParent()); - auto waveSize = getPipelineState()->getShaderWaveSize(shaderStage.value()); + auto waveSize = getPipelineState()->getShaderWaveSize(shaderStage); supportBPermute = supportBPermute || waveSize == 32; return supportBPermute; } @@ -261,10 +262,7 @@ bool BuilderImpl::supportWaveWideBPermute() const { // ===================================================================================================================== // Get whether the context we are building in supports permute lane 64 DPP operations. bool BuilderImpl::supportPermLane64Dpp() const { - auto gfxip = getPipelineState()->getTargetInfo().getGfxIpVersion().major; - auto shaderStage = getShaderStage(GetInsertBlock()->getParent()); - auto waveSize = getPipelineState()->getShaderWaveSize(shaderStage.value()); - return gfxip >= 11 && waveSize == 64; + return getPipelineState()->getTargetInfo().getGfxIpVersion().major >= 11; } // ===================================================================================================================== diff --git a/lgc/builder/DescBuilder.cpp b/lgc/builder/DescBuilder.cpp index c837b2129a..6ef7fbbf69 100644 --- a/lgc/builder/DescBuilder.cpp +++ b/lgc/builder/DescBuilder.cpp @@ -394,45 +394,47 @@ Value *BuilderImpl::buildBufferCompactDesc(Value *desc, unsigned stride) { Value *descElem1 = CreateExtractElement(desc, 1); // Build normal buffer descriptor - // Dword 0 Value *bufDesc = PoisonValue::get(FixedVectorType::get(getInt32Ty(), 4)); - bufDesc = CreateInsertElement(bufDesc, descElem0, uint64_t(0)); - - // Dword 1 - SqBufRsrcWord1 sqBufRsrcWord1 = {}; - sqBufRsrcWord1.bits.baseAddressHi = UINT16_MAX; - descElem1 = CreateAnd(descElem1, 
getInt32(sqBufRsrcWord1.u32All)); - if (stride) { - SqBufRsrcWord1 sqBufRsrcWord1Stride = {}; - sqBufRsrcWord1Stride.bits.stride = stride; - descElem1 = CreateOr(descElem1, getInt32(sqBufRsrcWord1Stride.u32All)); - } - bufDesc = CreateInsertElement(bufDesc, descElem1, 1); - - // Dword 2 - SqBufRsrcWord2 sqBufRsrcWord2 = {}; - sqBufRsrcWord2.bits.numRecords = UINT32_MAX; - bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord2.u32All), 2); - - // Dword 3 - SqBufRsrcWord3 sqBufRsrcWord3 = {}; - sqBufRsrcWord3.bits.dstSelX = BUF_DST_SEL_X; - sqBufRsrcWord3.bits.dstSelY = BUF_DST_SEL_Y; - sqBufRsrcWord3.bits.dstSelZ = BUF_DST_SEL_Z; - sqBufRsrcWord3.bits.dstSelW = BUF_DST_SEL_W; - if (gfxIp.major == 10) { - sqBufRsrcWord3.gfx10.format = BUF_FORMAT_32_UINT; - sqBufRsrcWord3.gfx10.resourceLevel = 1; - sqBufRsrcWord3.gfx10.oobSelect = stride ? 3 : 2; - assert(sqBufRsrcWord3.u32All == 0x21014FAC || sqBufRsrcWord3.u32All == 0x31014FAC); - } else if (gfxIp.major >= 11) { - sqBufRsrcWord3.gfx11.format = BUF_FORMAT_32_UINT; - sqBufRsrcWord3.gfx11.oobSelect = stride ? 
3 : 2; - assert(sqBufRsrcWord3.u32All == 0x20014FAC || sqBufRsrcWord3.u32All == 0x30014FAC); - } else { - llvm_unreachable("Not implemented!"); + { + // Dword 0 + bufDesc = CreateInsertElement(bufDesc, descElem0, uint64_t(0)); + + // Dword 1 + SqBufRsrcWord1 sqBufRsrcWord1 = {}; + sqBufRsrcWord1.bits.baseAddressHi = UINT16_MAX; + descElem1 = CreateAnd(descElem1, getInt32(sqBufRsrcWord1.u32All)); + if (stride) { + SqBufRsrcWord1 sqBufRsrcWord1Stride = {}; + sqBufRsrcWord1Stride.bits.stride = stride; + descElem1 = CreateOr(descElem1, getInt32(sqBufRsrcWord1Stride.u32All)); + } + bufDesc = CreateInsertElement(bufDesc, descElem1, 1); + + // Dword 2 + SqBufRsrcWord2 sqBufRsrcWord2 = {}; + sqBufRsrcWord2.bits.numRecords = UINT32_MAX; + bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord2.u32All), 2); + + // Dword 3 + SqBufRsrcWord3 sqBufRsrcWord3 = {}; + sqBufRsrcWord3.bits.dstSelX = BUF_DST_SEL_X; + sqBufRsrcWord3.bits.dstSelY = BUF_DST_SEL_Y; + sqBufRsrcWord3.bits.dstSelZ = BUF_DST_SEL_Z; + sqBufRsrcWord3.bits.dstSelW = BUF_DST_SEL_W; + if (gfxIp.major == 10) { + sqBufRsrcWord3.gfx10.format = BUF_FORMAT_32_UINT; + sqBufRsrcWord3.gfx10.resourceLevel = 1; + sqBufRsrcWord3.gfx10.oobSelect = stride ? 3 : 2; + assert(sqBufRsrcWord3.u32All == 0x21014FAC || sqBufRsrcWord3.u32All == 0x31014FAC); + } else if (gfxIp.major >= 11) { + sqBufRsrcWord3.gfx11.format = BUF_FORMAT_32_UINT; + sqBufRsrcWord3.gfx11.oobSelect = stride ? 
3 : 2; + assert(sqBufRsrcWord3.u32All == 0x20014FAC || sqBufRsrcWord3.u32All == 0x30014FAC); + } else { + llvm_unreachable("Not implemented!"); + } + bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord3.u32All), 3); } - bufDesc = CreateInsertElement(bufDesc, getInt32(sqBufRsrcWord3.u32All), 3); return bufDesc; } diff --git a/lgc/builder/ImageBuilder.cpp b/lgc/builder/ImageBuilder.cpp index 9b2fdb6d0e..363ebefd7c 100644 --- a/lgc/builder/ImageBuilder.cpp +++ b/lgc/builder/ImageBuilder.cpp @@ -423,14 +423,9 @@ static Type *convertToFloatingPointType(Type *origTy) { // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags, Value *imageDesc, Value *coord, Value *mipLevel, const Twine &instName) { - imageDesc = fixImageDescForRead(imageDesc); - // Mark usage of images, to allow the compute workgroup reconfiguration optimization. - getPipelineState()->getShaderResourceUsage(m_shaderStage.value())->useImages = true; - getPipelineState()->getShaderResourceUsage(m_shaderStage.value())->resourceRead = true; - assert(coord->getType()->getScalarType()->isIntegerTy(32)); - coord = handleFragCoordViewIndex(coord, flags, dim); + if (isa(imageDesc)) + return PoisonValue::get(resultTy); - unsigned dmask = 1; Type *origTexelTy = resultTy; if (auto structResultTy = dyn_cast(resultTy)) origTexelTy = structResultTy->getElementType(0); @@ -444,6 +439,21 @@ Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags texelTy = FixedVectorType::get(getHalfTy(), 4); } + bool isTexelBuffer = (dim == Dim1DBuffer || dim == Dim1DArrayBuffer); + bool needFullDesc = texelTy != origTexelTy && origTexelTy->isIntOrIntVectorTy(64) && origTexelTy->isVectorTy() && + m_pipelineState->getOptions().allowNullDescriptor; + imageDesc = transformImageDesc(imageDesc, needFullDesc, isTexelBuffer, resultTy); + const bool isVecTyDesc = imageDesc->getType()->isVectorTy(); + if (isVecTyDesc) + imageDesc = 
fixImageDescForRead(imageDesc); + // Mark usage of images, to allow the compute workgroup reconfiguration optimization. + getPipelineState()->getShaderResourceUsage(m_shaderStage.value())->useImages = true; + getPipelineState()->getShaderResourceUsage(m_shaderStage.value())->resourceRead = true; + assert(coord->getType()->getScalarType()->isIntegerTy(32)); + coord = handleFragCoordViewIndex(coord, flags, dim); + + unsigned dmask = 1; + if (auto vectorResultTy = dyn_cast(texelTy)) dmask = (1U << vectorResultTy->getNumElements()) - 1; @@ -462,7 +472,7 @@ Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags SmallVector args; Instruction *imageInst = nullptr; unsigned imageDescArgIndex = 0; - if (imageDesc->getType() == getDescTy(ResourceNodeType::DescriptorResource)) { + if (!isTexelBuffer) { // Not texel buffer; use image load instruction. // Build the intrinsic arguments. bool tfe = isa(intrinsicDataTy); @@ -509,11 +519,13 @@ Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags // Add a waterfall loop if needed. 
Value *result = imageInst; - if (flags & ImageFlagNonUniformImage) - result = createWaterfallLoop(imageInst, imageDescArgIndex, - getPipelineState()->getShaderOptions(m_shaderStage.value()).scalarizeWaterfallLoads); - else if (flags & ImageFlagEnforceReadFirstLaneImage) - enforceReadFirstLane(imageInst, imageDescArgIndex); + if (imageDesc->getType()->isVectorTy()) { + if (flags & ImageFlagNonUniformImage) + result = createWaterfallLoop(imageInst, imageDescArgIndex, + getPipelineState()->getShaderOptions(m_shaderStage.value()).scalarizeWaterfallLoads); + else if (flags & ImageFlagEnforceReadFirstLaneImage) + enforceReadFirstLane(imageInst, imageDescArgIndex); + } if (texelTy != origTexelTy) { Value *texel = result; @@ -581,6 +593,8 @@ Value *BuilderImpl::CreateImageLoad(Type *resultTy, unsigned dim, unsigned flags // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageLoadWithFmask(Type *resultTy, unsigned dim, unsigned flags, Value *imageDesc, Value *fmaskDesc, Value *coord, Value *sampleNum, const Twine &instName) { + if (isa(imageDesc)) + return PoisonValue::get(resultTy); // Load texel from F-mask image. unsigned fmaskDim = dim; switch (dim) { @@ -596,7 +610,7 @@ Value *BuilderImpl::CreateImageLoadWithFmask(Type *resultTy, unsigned dim, unsig } // When the shadow table is disabled, we don't need to load F-mask descriptor - if (m_pipelineState->getOptions().enableFmask) { + if (m_pipelineState->getOptions().enableFmask && !isa(fmaskDesc)) { Value *fmaskTexel = CreateImageLoad(FixedVectorType::get(getInt32Ty(), 4), fmaskDim, flags, fmaskDesc, coord, nullptr, instName + ".fmaskload"); @@ -607,6 +621,11 @@ Value *BuilderImpl::CreateImageLoadWithFmask(Type *resultTy, unsigned dim, unsig calcSampleNum = CreateAnd(calcSampleNum, getInt32(15)); // Check whether the F-mask descriptor has a BUF_DATA_FORMAT_INVALID (0) format (dword[1].bit[20-25]). 
+ if (!fmaskDesc->getType()->isVectorTy()) { + auto callInst = cast(fmaskTexel); + unsigned argIdx = callInst->arg_size() == 5 ? 0 : callInst->arg_size() - 3; + fmaskDesc = callInst->getArgOperand(argIdx); + } Value *fmaskFormat = CreateExtractElement(fmaskDesc, 1); fmaskFormat = CreateAnd(fmaskFormat, getInt32(63 << 20)); Value *fmaskValidFormat = CreateICmpNE(fmaskFormat, getInt32(0)); @@ -634,6 +653,8 @@ Value *BuilderImpl::CreateImageLoadWithFmask(Type *resultTy, unsigned dim, unsig // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageStore(Value *texel, unsigned dim, unsigned flags, Value *imageDesc, Value *coord, Value *mipLevel, const Twine &instName) { + if (isa(imageDesc)) + return PoisonValue::get(texel->getType()); // Mark usage of images, to allow the compute workgroup reconfiguration optimization. getPipelineState()->getShaderResourceUsage(m_shaderStage.value())->resourceWrite = true; assert(coord->getType()->getScalarType()->isIntegerTy(32)); @@ -658,11 +679,14 @@ Value *BuilderImpl::CreateImageStore(Value *texel, unsigned dim, unsigned flags, SmallVector derivatives; dim = prepareCoordinate(dim, coord, nullptr, nullptr, nullptr, coords, derivatives); + bool isTexelBuffer = (dim == Dim1DBuffer || dim == Dim1DArrayBuffer); + imageDesc = transformImageDesc(imageDesc, false, isTexelBuffer, texel->getType()); + Type *texelTy = texel->getType(); SmallVector args; Instruction *imageStore = nullptr; unsigned imageDescArgIndex = 0; - if (imageDesc->getType() == getDescTy(ResourceNodeType::DescriptorResource)) { + if (!isTexelBuffer) { // Not texel buffer; use image store instruction. // Build the intrinsic arguments. unsigned dmask = 1; @@ -717,12 +741,14 @@ Value *BuilderImpl::CreateImageStore(Value *texel, unsigned dim, unsigned flags, CreateIntrinsic(Intrinsic::amdgcn_struct_buffer_store_format, texel->getType(), args, nullptr, instName); } - // Add a waterfall loop if needed. 
- if (flags & ImageFlagNonUniformImage) - createWaterfallLoop(imageStore, imageDescArgIndex, - getPipelineState()->getShaderOptions(m_shaderStage.value()).scalarizeWaterfallLoads); - else if (flags & ImageFlagEnforceReadFirstLaneImage) - enforceReadFirstLane(imageStore, imageDescArgIndex); + if (imageDesc->getType()->isVectorTy()) { + // Add a waterfall loop if needed. + if (flags & ImageFlagNonUniformImage) + createWaterfallLoop(imageStore, imageDescArgIndex, + getPipelineState()->getShaderOptions(m_shaderStage.value()).scalarizeWaterfallLoads); + else if (flags & ImageFlagEnforceReadFirstLaneImage) + enforceReadFirstLane(imageStore, imageDescArgIndex); + } return imageStore; } @@ -805,6 +831,10 @@ Value *BuilderImpl::CreateImageSampleConvertYCbCr(Type *resultTy, unsigned dim, Value *imageDesc = imageDescArray; if (isa(imageDescArray->getType())) imageDesc = CreateExtractValue(imageDescArray, 0); + if (isa(imageDesc)) + imageDesc = PoisonValue::get(FixedVectorType::get(getInt32Ty(), 8)); + else + imageDesc = transformImageDesc(imageDesc, true, false, resultTy); imageDesc = fixImageDescForRead(imageDesc); YCbCrSampleInfo sampleInfoLuma = {resultTy, dim, flags, imageDesc, samplerDescLuma, address, instName.str(), true}; @@ -817,6 +847,10 @@ Value *BuilderImpl::CreateImageSampleConvertYCbCr(Type *resultTy, unsigned dim, // Set image descriptor for chroma channel for (unsigned planeIdx = 1; planeIdx < yCbCrMetaData.word1.planes; ++planeIdx) { imageDesc = CreateExtractValue(imageDescArray, planeIdx); + if (isa(imageDesc)) + imageDesc = PoisonValue::get(FixedVectorType::get(getInt32Ty(), 8)); + else + imageDesc = transformImageDesc(imageDesc, true, false, resultTy); imageDesc = fixImageDescForRead(imageDesc); YCbCrConverter.SetImgDescChroma(planeIdx, imageDesc); } @@ -843,6 +877,9 @@ Value *BuilderImpl::CreateImageSampleConvertYCbCr(Type *resultTy, unsigned dim, // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageGather(Type *resultTy, 
unsigned dim, unsigned flags, Value *imageDesc, Value *samplerDesc, ArrayRef address, const Twine &instName) { + if (isa(imageDesc) || isa(samplerDesc)) + return PoisonValue::get(resultTy); + Value *coord = address[ImageAddressIdxCoordinate]; assert(coord->getType()->getScalarType()->isFloatTy() || coord->getType()->getScalarType()->isHalfTy()); @@ -861,8 +898,7 @@ Value *BuilderImpl::CreateImageGather(Type *resultTy, unsigned dim, unsigned fla gatherTy = StructType::get(getContext(), {gatherTy, getInt32Ty()}); } - // Only the first 4 dwords are sampler descriptor, we need to extract these values under any condition - samplerDesc = CreateShuffleVector(samplerDesc, samplerDesc, ArrayRef{0, 1, 2, 3}); + samplerDesc = transformSamplerDesc(samplerDesc); if (m_pipelineState->getOptions().disableTruncCoordForGather) { samplerDesc = modifySamplerDescForGather(samplerDesc); @@ -930,7 +966,16 @@ Value *BuilderImpl::CreateImageGather(Type *resultTy, unsigned dim, unsigned fla Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsigned flags, Value *coord, Value *imageDesc, Value *samplerDesc, ArrayRef address, const Twine &instName, bool isSample) { - imageDesc = fixImageDescForRead(imageDesc); + if (isa(imageDesc) || isa(samplerDesc)) + return PoisonValue::get(resultTy); + + imageDesc = transformImageDesc(imageDesc, false, false, resultTy); + const bool isVecTyDesc = imageDesc->getType()->isVectorTy(); + if (isVecTyDesc) + imageDesc = fixImageDescForRead(imageDesc); + + samplerDesc = transformSamplerDesc(samplerDesc); + // Mark usage of images, to allow the compute workgroup reconfiguration optimization. getPipelineState()->getShaderResourceUsage(m_shaderStage.value())->useImages = true; // Set up the mask of address components provided, for use in searching the intrinsic ID table @@ -1057,16 +1102,22 @@ Value *BuilderImpl::CreateImageSampleGather(Type *resultTy, unsigned dim, unsign // Add a waterfall loop if needed. 
SmallVector nonUniformArgIndexes; - if (flags & ImageFlagNonUniformImage) - nonUniformArgIndexes.push_back(imageDescArgIndex); - else if (flags & ImageFlagEnforceReadFirstLaneImage) - enforceReadFirstLane(imageOp, imageDescArgIndex); + if (imageDesc->getType()->isVectorTy()) { + if (flags & ImageFlagNonUniformImage) + nonUniformArgIndexes.push_back(imageDescArgIndex); + else if (flags & ImageFlagEnforceReadFirstLaneImage) + enforceReadFirstLane(imageOp, imageDescArgIndex); + } - const unsigned samplerDescArgIndex = imageDescArgIndex + 1; - if (flags & ImageFlagNonUniformSampler) - nonUniformArgIndexes.push_back(samplerDescArgIndex); - else if (flags & ImageFlagEnforceReadFirstLaneSampler) - enforceReadFirstLane(imageOp, samplerDescArgIndex); + if (samplerDesc->getType()->isVectorTy()) { + const unsigned samplerDescArgIndex = imageDescArgIndex + 1; + if (flags & ImageFlagNonUniformSampler) { + nonUniformArgIndexes.push_back(samplerDescArgIndex); + } else { + // TODO: Re-add the condition once backend fix the waterfall loop bug. 
+ enforceReadFirstLane(imageOp, samplerDescArgIndex); + } + } if (!nonUniformArgIndexes.empty()) imageOp = createWaterfallLoop(imageOp, nonUniformArgIndexes, @@ -1123,6 +1174,8 @@ Value *BuilderImpl::CreateImageAtomicCompareSwap(unsigned dim, unsigned flags, A Value *BuilderImpl::CreateImageAtomicCommon(unsigned atomicOp, unsigned dim, unsigned flags, AtomicOrdering ordering, Value *imageDesc, Value *coord, Value *inputValue, Value *comparatorValue, const Twine &instName) { + if (isa(imageDesc)) + return PoisonValue::get(inputValue->getType()); getPipelineState()->getShaderResourceUsage(m_shaderStage.value())->resourceWrite = true; assert(coord->getType()->getScalarType()->isIntegerTy(32)); coord = handleFragCoordViewIndex(coord, flags, dim); @@ -1131,7 +1184,7 @@ Value *BuilderImpl::CreateImageAtomicCommon(unsigned atomicOp, unsigned dim, uns case AtomicOrdering::Release: case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: - CreateFence(AtomicOrdering::Release, SyncScope::System); + CreateFence(AtomicOrdering::Release, getContext().getOrInsertSyncScopeID("agent")); break; default: break; @@ -1142,10 +1195,13 @@ Value *BuilderImpl::CreateImageAtomicCommon(unsigned atomicOp, unsigned dim, uns SmallVector derivatives; dim = prepareCoordinate(dim, coord, nullptr, nullptr, nullptr, coords, derivatives); + bool isTexelBuffer = (dim == Dim1DBuffer || dim == Dim1DArrayBuffer); + imageDesc = transformImageDesc(imageDesc, false, isTexelBuffer, nullptr); + SmallVector args; Instruction *atomicInst = nullptr; unsigned imageDescArgIndex = 0; - if (imageDesc->getType() == getDescTy(ResourceNodeType::DescriptorResource)) { + if (!isTexelBuffer) { // Resource descriptor. Use the image atomic instruction. 
args.push_back(inputValue); if (atomicOp == AtomicOpCompareSwap) @@ -1176,18 +1232,20 @@ Value *BuilderImpl::CreateImageAtomicCommon(unsigned atomicOp, unsigned dim, uns atomicInst = CreateIntrinsic(StructBufferAtomicIntrinsicTable[atomicOp], inputValue->getType(), args, nullptr, instName); } - if (flags & ImageFlagNonUniformImage) - atomicInst = - createWaterfallLoop(atomicInst, imageDescArgIndex, - getPipelineState()->getShaderOptions(m_shaderStage.value()).scalarizeWaterfallLoads); - else if (flags & ImageFlagEnforceReadFirstLaneImage) - enforceReadFirstLane(atomicInst, imageDescArgIndex); + if (imageDesc->getType()->isVectorTy()) { + if (flags & ImageFlagNonUniformImage) + atomicInst = + createWaterfallLoop(atomicInst, imageDescArgIndex, + getPipelineState()->getShaderOptions(m_shaderStage.value()).scalarizeWaterfallLoads); + else if (flags & ImageFlagEnforceReadFirstLaneImage) + enforceReadFirstLane(atomicInst, imageDescArgIndex); + } switch (ordering) { case AtomicOrdering::Acquire: case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: - CreateFence(AtomicOrdering::Acquire, SyncScope::System); + CreateFence(AtomicOrdering::Acquire, getContext().getOrInsertSyncScopeID("agent")); break; default: break; @@ -1204,8 +1262,12 @@ Value *BuilderImpl::CreateImageAtomicCommon(unsigned atomicOp, unsigned dim, uns // @param imageDesc : Image descriptor or texel buffer descriptor // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageQueryLevels(unsigned dim, unsigned flags, Value *imageDesc, const Twine &instName) { + if (isa(imageDesc)) + return PoisonValue::get(getInt32Ty()); dim = dim == DimCubeArray ? 
DimCube : dim; + imageDesc = transformImageDesc(imageDesc, true, false, nullptr); + Value *numMipLevel = nullptr; GfxIpVersion gfxIp = getPipelineState()->getTargetInfo().getGfxIpVersion(); SqImgRsrcRegHandler proxySqRsrcRegHelper(this, imageDesc, &gfxIp); @@ -1242,6 +1304,10 @@ Value *BuilderImpl::CreateImageQueryLevels(unsigned dim, unsigned flags, Value * // @param imageDesc : Image descriptor or texel buffer descriptor // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageQuerySamples(unsigned dim, unsigned flags, Value *imageDesc, const Twine &instName) { + if (isa(imageDesc)) + return PoisonValue::get(getInt32Ty()); + + imageDesc = transformImageDesc(imageDesc, true, false, nullptr); Value *descWord3 = CreateExtractElement(imageDesc, 3); Value *lastLevel = nullptr; if (m_pipelineState->getTargetInfo().getGfxIpVersion().major <= 11) { @@ -1299,7 +1365,11 @@ Value *BuilderImpl::CreateImageQuerySamples(unsigned dim, unsigned flags, Value // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageQuerySize(unsigned dim, unsigned flags, Value *imageDesc, Value *lod, const Twine &instName) { - if (imageDesc->getType() == getDescTy(ResourceNodeType::DescriptorTexelBuffer)) { + if (isa(imageDesc)) + return PoisonValue::get(getInt32Ty()); + bool isTexelBuffer = (dim == Dim1DBuffer || dim == Dim1DArrayBuffer); + imageDesc = transformImageDesc(imageDesc, true, isTexelBuffer, nullptr); + if (isTexelBuffer) { // Texel buffer. 
// Extract NUM_RECORDS (SQ_BUF_RSRC_WORD2) Value *numRecords = CreateExtractElement(imageDesc, 2); @@ -1396,6 +1466,9 @@ Value *BuilderImpl::CreateImageQuerySize(unsigned dim, unsigned flags, Value *im // @param instName : Name to give instruction(s) Value *BuilderImpl::CreateImageGetLod(unsigned dim, unsigned flags, Value *imageDesc, Value *samplerDesc, Value *coord, const Twine &instName) { + if (isa(imageDesc) || isa(samplerDesc)) + return PoisonValue::get(FixedVectorType::get(getFloatTy(), 2)); + // Remove array from dimension if any. switch (dim) { case Dim1DArray: @@ -1417,8 +1490,13 @@ Value *BuilderImpl::CreateImageGetLod(unsigned dim, unsigned flags, Value *image SmallVector derivatives; dim = prepareCoordinate(dim, coord, nullptr, nullptr, nullptr, coords, derivatives); - // Only the first 4 dwords are sampler descriptor, we need to extract these values under any condition - samplerDesc = CreateShuffleVector(samplerDesc, samplerDesc, ArrayRef{0, 1, 2, 3}); + imageDesc = transformImageDesc(imageDesc, false, false, nullptr); + if (isa(samplerDesc->getType())) { + // Only the first 4 dwords are sampler descriptor, we need to extract these values under any condition + samplerDesc = CreateShuffleVector(samplerDesc, ArrayRef{0, 1, 2, 3}); + } else { + samplerDesc = transformSamplerDesc(samplerDesc); + } SmallVector args; args.push_back(getInt32(3)); // dmask @@ -1432,19 +1510,23 @@ Value *BuilderImpl::CreateImageGetLod(unsigned dim, unsigned flags, Value *image Instruction *result = CreateIntrinsic(ImageGetLodIntrinsicTable[dim], {FixedVectorType::get(getFloatTy(), 2), getFloatTy()}, args, nullptr, instName); - // Add a waterfall loop if needed. 
- SmallVector nonUniformArgIndexes; - if (flags & ImageFlagNonUniformImage) - nonUniformArgIndexes.push_back(imageDescArgIndex); - else if (flags & ImageFlagEnforceReadFirstLaneImage) - enforceReadFirstLane(result, imageDescArgIndex); - const unsigned samplerDescArgIndex = imageDescArgIndex + 1; - if (flags & ImageFlagNonUniformSampler) - nonUniformArgIndexes.push_back(samplerDescArgIndex); - else if (flags & ImageFlagEnforceReadFirstLaneSampler) - enforceReadFirstLane(result, samplerDescArgIndex); + SmallVector nonUniformArgIndexes; + if (imageDesc->getType()->isVectorTy()) { + // Add a waterfall loop if needed. + if (flags & ImageFlagNonUniformImage) + nonUniformArgIndexes.push_back(imageDescArgIndex); + else if (flags & ImageFlagEnforceReadFirstLaneImage) + enforceReadFirstLane(result, imageDescArgIndex); + } + if (samplerDesc->getType()->isVectorTy()) { + const unsigned samplerDescArgIndex = imageDescArgIndex + 1; + if (flags & ImageFlagNonUniformSampler) + nonUniformArgIndexes.push_back(samplerDescArgIndex); + else if (flags & ImageFlagEnforceReadFirstLaneSampler) + enforceReadFirstLane(result, samplerDescArgIndex); + } if (!nonUniformArgIndexes.empty()) result = createWaterfallLoop(result, nonUniformArgIndexes, getPipelineState()->getShaderOptions(m_shaderStage.value()).scalarizeWaterfallLoads); @@ -1911,3 +1993,40 @@ Value *BuilderImpl::modifySamplerDescForGather(Value *samplerDesc) { return samplerDesc; } + +// ===================================================================================================================== +// Transform image descriptor pointer to a i32 type or a descriptor load instruction. 
+// +// @param imageDesc : image descriptor pointer +// @param mustLoad : Whether to load image descriptor from the pointer +// @param isTexelBuffer : Whether it is a texel buffer +// @param texelType : The type of the texel +// @returns The transformed descriptor +Value *BuilderImpl::transformImageDesc(Value *imageDesc, bool mustLoad, bool isTexelBuffer, Type *texelType) { + assert(!isa(imageDesc)); + if (isa(imageDesc->getType())) + return imageDesc; + + // Explicitly load the descriptor from the descriptor pointer + Type *descType = FixedVectorType::get(getInt32Ty(), isTexelBuffer ? 4 : 8); + Value *desc = CreateLoad(descType, imageDesc); + cast(desc)->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(getContext(), {})); + return desc; +} + +// ===================================================================================================================== +// Transform sampler descriptor pointer to a i32 type or a descriptor load instruction. +// +// @param samplerDesc : descriptor pointer or a full descriptor +// @returns Transformed sampler descriptor +Value *BuilderImpl::transformSamplerDesc(Value *samplerDesc) { + assert(!isa(samplerDesc)); + if (isa(samplerDesc->getType())) + return samplerDesc; + + // Explicitly load the descriptor from the descriptor pointer + Type *descType = FixedVectorType::get(getInt32Ty(), 4); + Value *desc = CreateLoad(descType, samplerDesc); + cast(desc)->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(getContext(), {})); + return desc; +} diff --git a/lgc/builder/InOutBuilder.cpp b/lgc/builder/InOutBuilder.cpp index 4172a0aa6b..14cd7b5032 100644 --- a/lgc/builder/InOutBuilder.cpp +++ b/lgc/builder/InOutBuilder.cpp @@ -430,8 +430,8 @@ void BuilderImpl::markGenericInputOutputUsage(bool isOutput, unsigned location, if (isOutput) { // Keep all locations if the next stage of the output is fragment shader or is unspecified if (m_shaderStage != ShaderStage::Fragment) { - ShaderStageEnum nextStage = 
m_pipelineState->getNextShaderStage(m_shaderStage.value()); - keepAllLocations = nextStage == ShaderStage::Fragment || nextStage == ShaderStage::Invalid; + auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage.value()); + keepAllLocations = nextStage == ShaderStage::Fragment || !nextStage; } } else { // Keep all locations if it is the input of fragment shader diff --git a/lgc/builder/SubgroupBuilder.cpp b/lgc/builder/SubgroupBuilder.cpp index 3a74d5bc01..c62acd661c 100644 --- a/lgc/builder/SubgroupBuilder.cpp +++ b/lgc/builder/SubgroupBuilder.cpp @@ -79,31 +79,27 @@ unsigned BuilderImpl::getShaderWaveSize() { // // @param instName : Name to give final instruction. Value *SubgroupBuilder::CreateSubgroupElect(const Twine &instName) { - bool excludeHelperLanes = false; - if (getShaderStage(GetInsertBlock()->getParent()).value() == ShaderStage::Fragment) - excludeHelperLanes = m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsExcludeHelperLanes; - return CreateICmpEQ(CreateSubgroupMbcnt(createGroupBallot(getTrue(), excludeHelperLanes)), getInt32(0)); + auto shaderStage = getShaderStage(GetInsertBlock()->getParent()); + return CreateICmpEQ(CreateSubgroupMbcnt(createGroupBallot(getTrue(), shaderStage.value())), getInt32(0)); } // ===================================================================================================================== // Create a subgroup all call. // // @param value : The value to compare across the subgroup. Must be an integer type. +// @param shaderStage : shader stage enum. // @param instName : Name to give final instruction. 
-Value *SubgroupBuilder::CreateSubgroupAll(Value *const value, const Twine &instName) { - bool ballotExcludeHelperLanes = false; +Value *SubgroupBuilder::createSubgroupAll(Value *const value, ShaderStageEnum shaderStage, const Twine &instName) { bool includeHelperLanes = false; bool requireHelperLanes = false; - if (getShaderStage(GetInsertBlock()->getParent()).value() == ShaderStage::Fragment) { + if (shaderStage == ShaderStage::Fragment) { const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode(); - ballotExcludeHelperLanes = fragmentMode.waveOpsExcludeHelperLanes; includeHelperLanes = !fragmentMode.waveOpsExcludeHelperLanes; requireHelperLanes = fragmentMode.waveOpsRequireHelperLanes; } - Value *result = CreateICmpEQ(createGroupBallot(value, ballotExcludeHelperLanes), - createGroupBallot(getTrue(), ballotExcludeHelperLanes)); + Value *result = CreateICmpEQ(createGroupBallot(value, shaderStage), createGroupBallot(getTrue(), shaderStage)); result = CreateSelect(CreateUnaryIntrinsic(Intrinsic::is_constant, value), value, result); // Helper invocations of whole quad mode should be included in the subgroup vote execution @@ -122,18 +118,18 @@ Value *SubgroupBuilder::CreateSubgroupAll(Value *const value, const Twine &instN // @param value : The value to compare across the subgroup. Must be an integer type. // @param instName : Name to give final instruction. 
Value *SubgroupBuilder::CreateSubgroupAny(Value *const value, const Twine &instName) { - bool ballotExcludeHelperLanes = false; + auto shaderStage = getShaderStage(GetInsertBlock()->getParent()); + bool includeHelperLanes = false; bool requireHelperLanes = false; if (getShaderStage(GetInsertBlock()->getParent()).value() == ShaderStage::Fragment) { const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode(); - ballotExcludeHelperLanes = fragmentMode.waveOpsExcludeHelperLanes; includeHelperLanes = !fragmentMode.waveOpsExcludeHelperLanes; requireHelperLanes = fragmentMode.waveOpsRequireHelperLanes; } - Value *result = CreateICmpNE(createGroupBallot(value, ballotExcludeHelperLanes), getInt64(0)); + Value *result = CreateICmpNE(createGroupBallot(value, shaderStage.value()), getInt64(0)); result = CreateSelect(CreateUnaryIntrinsic(Intrinsic::is_constant, value), value, result); // Helper invocations of whole quad mode should be included in the subgroup vote execution @@ -152,9 +148,11 @@ Value *SubgroupBuilder::CreateSubgroupAny(Value *const value, const Twine &instN // @param value : The value to compare across the subgroup. Must be an integer type. // @param instName : Name to give final instruction. 
Value *SubgroupBuilder::CreateSubgroupAllEqual(Value *const value, const Twine &instName) { + auto shaderStage = getShaderStage(GetInsertBlock()->getParent()).value(); + Type *const type = value->getType(); - Value *compare = CreateSubgroupBroadcastFirst(value, instName); + Value *compare = createSubgroupBroadcastFirst(value, shaderStage, instName); if (type->isFPOrFPVectorTy()) compare = CreateFCmpOEQ(compare, value); @@ -169,9 +167,9 @@ Value *SubgroupBuilder::CreateSubgroupAllEqual(Value *const value, const Twine & for (unsigned i = 1, compCount = cast(type)->getNumElements(); i < compCount; i++) result = CreateAnd(result, CreateExtractElement(compare, i)); - return CreateSubgroupAll(result, instName); + return createSubgroupAll(result, shaderStage, instName); } - return CreateSubgroupAll(compare, instName); + return createSubgroupAll(compare, shaderStage, instName); } // ===================================================================================================================== @@ -183,6 +181,8 @@ Value *SubgroupBuilder::CreateSubgroupAllEqual(Value *const value, const Twine & // @param instName : Name to give final instruction. Value *SubgroupBuilder::CreateSubgroupRotate(Value *const value, Value *const delta, Value *const clusterSize, const Twine &instName) { + auto shaderStage = getShaderStage(GetInsertBlock()->getParent()).value(); + // LocalId = SubgroupLocalInvocationId // RotationGroupSize = hasClusterSIze? ClusterSize : SubgroupSize. 
// Invocation ID = ((LocalId + Delta) & (RotationGroupSize - 1)) + (LocalId & ~(RotationGroupSize - 1)) @@ -194,7 +194,7 @@ Value *SubgroupBuilder::CreateSubgroupRotate(Value *const value, Value *const de CreateOr(CreateAnd(invocationId, rotationGroupSize), CreateAnd(localId, CreateNot(rotationGroupSize))); } - return CreateSubgroupShuffle(value, invocationId, instName); + return createSubgroupShuffle(value, invocationId, shaderStage, instName); } // ===================================================================================================================== @@ -232,12 +232,14 @@ Value *BuilderImpl::CreateSubgroupBroadcastWaterfall(Value *const value, Value * // Create a subgroup broadcastfirst call. // // @param value : The value to read from the first active lane into all other active lanes. +// @param shaderStage : shader stage enum. // @param instName : Name to give final instruction. -Value *BuilderImpl::CreateSubgroupBroadcastFirst(Value *const value, const Twine &instName) { - const auto &fragmentMode = m_pipelineState->getShaderModes()->getFragmentShaderMode(); +Value *BuilderImpl::createSubgroupBroadcastFirst(Value *const value, ShaderStageEnum shaderStage, + const Twine &instName) { // For waveOpsExcludeHelperLanes mode, we need filter out the helperlane and use readlane instead. - if (m_shaderStage == ShaderStage::Fragment && fragmentMode.waveOpsExcludeHelperLanes) { - Value *ballot = createGroupBallot(getTrue()); + if (shaderStage == ShaderStage::Fragment && + m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsExcludeHelperLanes) { + Value *ballot = createGroupBallot(getTrue(), shaderStage); Value *firstlane = CreateIntrinsic(Intrinsic::cttz, getInt64Ty(), {ballot, getTrue()}); firstlane = CreateTrunc(firstlane, getInt32Ty()); @@ -384,10 +386,12 @@ Value *BuilderImpl::CreateSubgroupBallotFindMsb(Value *const value, const Twine // // @param value : The value to shuffle. // @param index : The index to shuffle from. 
+// @param shaderStage : shader stage enum. // @param instName : Name to give final instruction. -Value *BuilderImpl::CreateSubgroupShuffle(Value *const value, Value *const index, const Twine &instName) { +Value *BuilderImpl::createSubgroupShuffle(Value *const value, Value *const index, ShaderStageEnum shaderStage, + const Twine &instName) { - if (supportWaveWideBPermute()) { + if (supportWaveWideBPermute(shaderStage)) { auto mapFunc = [](BuilderBase &builder, ArrayRef mappedArgs, ArrayRef passthroughArgs) -> Value * { return builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {passthroughArgs[0], mappedArgs[0]}); @@ -398,7 +402,7 @@ Value *BuilderImpl::CreateSubgroupShuffle(Value *const value, Value *const index } if (supportPermLane64Dpp()) { - assert(getShaderWaveSize() == 64); + assert(getPipelineState()->getShaderWaveSize(shaderStage) == 64); // Start the WWM section by setting the inactive lanes. Value *const poisonValue = PoisonValue::get(value->getType()); @@ -431,9 +435,9 @@ Value *BuilderImpl::CreateSubgroupShuffle(Value *const value, Value *const index auto result = CreateSelect(indexInSameHalf, bPermSameHalf, bPermOtherHalf); // If required, force inputs of the operation to be computed in WQM. - if (m_shaderStage == ShaderStage::Fragment && + if (shaderStage == ShaderStage::Fragment && m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsRequireHelperLanes) - result = createWqm(result); + result = createWqm(result, shaderStage); return result; } @@ -630,12 +634,15 @@ Value *BuilderImpl::CreateSubgroupClusteredReduction(GroupArithOp groupArithOp, // Use a permute lane to cross rows (row 1 <-> row 0, row 3 <-> row 2). 
result = createGroupArithmeticOperation(groupArithOp, result, createPermLaneX16(result, result, UINT32_MAX, UINT32_MAX, true, false)); + if (waveSize == 32) + result = createReadFirstLane(result); } if (clusterSize == 64) { assert(waveSize == 64); if (supportPermLane64Dpp()) { result = createGroupArithmeticOperation(groupArithOp, result, createPermLane64(result)); + result = createReadFirstLane(result); } else { Value *const broadcast31 = CreateSubgroupBroadcast(result, getInt32(31), instName); Value *const broadcast63 = CreateSubgroupBroadcast(result, getInt32(63), instName); @@ -1303,6 +1310,18 @@ Value *BuilderImpl::createPermLane64(Value *const updateValue) { return CreateMapToSimpleType(mapFunc, updateValue, {}); } +// ===================================================================================================================== +// Create a call to get the first lane. +// +// @param updateValue : The value to update with. +Value *BuilderImpl::createReadFirstLane(Value *const updateValue) { + auto mapFunc = [](BuilderBase &builder, ArrayRef mappedArgs, ArrayRef passthroughArgs) -> Value * { + return builder.CreateIntrinsic(builder.getInt32Ty(), Intrinsic::amdgcn_readfirstlane, {mappedArgs[0]}); + }; + + return CreateMapToSimpleType(mapFunc, updateValue, {}); +} + // ===================================================================================================================== // Create a call to ds swizzle. // @@ -1333,12 +1352,13 @@ Value *BuilderImpl::createWwm(Value *const value) { // Only in fragment shader stage. // // @param value : The value to pass to the soft WQM call. -Value *BuilderImpl::createWqm(Value *const value) { +// @param shaderStage : shader stage enum. 
+Value *BuilderImpl::createWqm(Value *const value, ShaderStageEnum shaderStage) { auto mapFunc = [](BuilderBase &builder, ArrayRef mappedArgs, ArrayRef) -> Value * { return builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_wqm, mappedArgs[0]); }; - if (m_shaderStage == ShaderStage::Fragment) + if (shaderStage == ShaderStage::Fragment) return CreateMapToSimpleType(mapFunc, value, {}); return value; @@ -1398,15 +1418,16 @@ Value *BuilderImpl::createThreadMaskedSelect(Value *const threadMask, uint64_t a // Do group ballot, turning a per-lane boolean value (in a VGPR) into a subgroup-wide shared SGPR. // // @param value : The value to contribute to the SGPR, must be an boolean type. -// @param excludeHelperLanes : exclude helper lanes. -Value *BuilderImpl::createGroupBallot(Value *const value, bool excludeHelperLanes) { +// @param shaderStage : shader stage enum. +Value *BuilderImpl::createGroupBallot(Value *const value, ShaderStageEnum shaderStage) { // Check the type is definitely an boolean. assert(value->getType()->isIntegerTy(1)); Value *result = value; // For waveOpsExcludeHelperLanes mode, we need mask away the helperlane. - if (excludeHelperLanes) { + if (shaderStage == ShaderStage::Fragment && + m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsExcludeHelperLanes) { auto isLive = CreateIntrinsic(Intrinsic::amdgcn_live_mask, {}, {}, nullptr, {}); result = CreateAnd(isLive, result); } @@ -1426,11 +1447,7 @@ Value *BuilderImpl::createGroupBallot(Value *const value, bool excludeHelperLane // // @param value : The value to contribute to the SGPR, must be an boolean type. Value *BuilderImpl::createGroupBallot(Value *const value) { - // For waveOpsExcludeHelperLanes mode, we need mask away the helperlane. 
- bool excludeHelperLanes = false; - if (m_shaderStage == ShaderStage::Fragment) - excludeHelperLanes = m_pipelineState->getShaderModes()->getFragmentShaderMode().waveOpsExcludeHelperLanes; - return createGroupBallot(value, excludeHelperLanes); + return createGroupBallot(value, m_shaderStage.value()); } // ===================================================================================================================== diff --git a/lgc/include/lgc/builder/BuilderImpl.h b/lgc/include/lgc/builder/BuilderImpl.h index 98057d0670..652b7b920c 100644 --- a/lgc/include/lgc/builder/BuilderImpl.h +++ b/lgc/include/lgc/builder/BuilderImpl.h @@ -74,7 +74,7 @@ class BuilderImpl : public BuilderDefs { PipelineState *getPipelineState() const { return m_pipelineState; } // Get whether the context we are building in supports ds_bpermute or v_bpermute across all lanes in the wave. - bool supportWaveWideBPermute() const; + bool supportWaveWideBPermute(ShaderStageEnum shaderStage) const; // Get whether the context we are building in supports permute lane 64 DPP operations. bool supportPermLane64Dpp() const; @@ -441,6 +441,12 @@ class BuilderImpl : public BuilderDefs { // Modify sampler descriptor to force set trunc_coord as 0 for gather4 instruction. llvm::Value *modifySamplerDescForGather(llvm::Value *samplerDesc); + // Transform 32-bit image descriptor pointer to a i32 type or a descriptor load instruction. + llvm::Value *transformImageDesc(llvm::Value *imageDesc, bool mustLoad, bool isTexelBuffer, llvm::Type *texelType); + + // Transform 32-bit sampler descriptor pointer to a i32 type or a descriptor load instruction. + llvm::Value *transformSamplerDesc(llvm::Value *samplerDesc); + enum ImgDataFormat { IMG_DATA_FORMAT_32 = 4, IMG_DATA_FORMAT_8_8_8_8 = 10, @@ -657,7 +663,9 @@ class BuilderImpl : public BuilderDefs { const llvm::Twine &instName = ""); // Create a subgroup broadcast first. 
- llvm::Value *CreateSubgroupBroadcastFirst(llvm::Value *const value, const llvm::Twine &instName = ""); + llvm::Value *CreateSubgroupBroadcastFirst(llvm::Value *const value, const llvm::Twine &instName = "") { + return createSubgroupBroadcastFirst(value, m_shaderStage.value(), instName); + } // Create a subgroup ballot. llvm::Value *CreateSubgroupBallot(llvm::Value *const value, const llvm::Twine &instName = ""); @@ -686,7 +694,9 @@ class BuilderImpl : public BuilderDefs { // Create a subgroup shuffle. llvm::Value *CreateSubgroupShuffle(llvm::Value *const value, llvm::Value *const index, - const llvm::Twine &instName = ""); + const llvm::Twine &instName = "") { + return createSubgroupShuffle(value, index, m_shaderStage.value(), instName); + } // Create a subgroup shuffle xor. llvm::Value *CreateSubgroupShuffleXor(llvm::Value *const value, llvm::Value *const mask, @@ -771,20 +781,28 @@ class BuilderImpl : public BuilderDefs { llvm::Value *createPermLaneX16(llvm::Value *const origValue, llvm::Value *const updateValue, unsigned selectBitsLow, unsigned selectBitsHigh, bool fetchInactive, bool boundCtrl); llvm::Value *createPermLane64(llvm::Value *const updateValue); + llvm::Value *createReadFirstLane(llvm::Value *const updateValue); llvm::Value *createDsSwizzle(llvm::Value *const value, uint16_t dsPattern); llvm::Value *createWwm(llvm::Value *const value); - llvm::Value *createWqm(llvm::Value *const value); + llvm::Value *createWqm(llvm::Value *const value) { return createWqm(value, m_shaderStage.value()); } llvm::Value *createThreadMask(); llvm::Value *createThreadMaskedSelect(llvm::Value *const threadMask, uint64_t andMask, llvm::Value *const value1, llvm::Value *const value2); uint16_t getDsSwizzleBitMode(uint8_t xorMask, uint8_t orMask, uint8_t andMask); uint16_t getDsSwizzleQuadMode(uint8_t lane0, uint8_t lane1, uint8_t lane2, uint8_t lane3); -protected: - llvm::Value *createGroupBallot(llvm::Value *const value, bool excludeHelperLanes); llvm::Value 
*createGroupBallot(llvm::Value *const value); + +protected: + // The subgroup operation with explicit shader stage as parameter. llvm::Value *createFindMsb(llvm::Value *const mask); + llvm::Value *createGroupBallot(llvm::Value *const value, ShaderStageEnum shaderStage); + llvm::Value *createSubgroupBroadcastFirst(llvm::Value *const value, ShaderStageEnum shaderStage, + const llvm::Twine &instName); + llvm::Value *createSubgroupShuffle(llvm::Value *const value, llvm::Value *const index, ShaderStageEnum shaderStage, + const llvm::Twine &instName); + llvm::Value *createWqm(llvm::Value *const value, ShaderStageEnum shaderStage); }; } // namespace lgc diff --git a/lgc/include/lgc/builder/SubgroupBuilder.h b/lgc/include/lgc/builder/SubgroupBuilder.h index afd0c82b0c..6497951021 100644 --- a/lgc/include/lgc/builder/SubgroupBuilder.h +++ b/lgc/include/lgc/builder/SubgroupBuilder.h @@ -59,7 +59,9 @@ class SubgroupBuilder : public BuilderImpl { // // @param value : The value to compare // @param instName : Name to give instruction(s) - llvm::Value *CreateSubgroupAll(llvm::Value *const value, const llvm::Twine &instName = ""); + llvm::Value *CreateSubgroupAll(llvm::Value *const value, const llvm::Twine &instName = "") { + return createSubgroupAll(value, getShaderStage(GetInsertBlock()->getParent()).value(), instName); + } // Create a subgroup all equal. // @@ -80,6 +82,9 @@ class SubgroupBuilder : public BuilderImpl { SubgroupBuilder() = delete; SubgroupBuilder(const SubgroupBuilder &) = delete; SubgroupBuilder &operator=(const SubgroupBuilder &) = delete; + + // The subgroup operation with explicit shader stage as parameter. 
+ llvm::Value *createSubgroupAll(llvm::Value *const value, ShaderStageEnum shaderStage, const llvm::Twine &instName); }; } // namespace lgc diff --git a/lgc/include/lgc/patch/LowerCooperativeMatrix.h b/lgc/include/lgc/patch/LowerCooperativeMatrix.h index 7deb5cbee9..e277d2e23b 100644 --- a/lgc/include/lgc/patch/LowerCooperativeMatrix.h +++ b/lgc/include/lgc/patch/LowerCooperativeMatrix.h @@ -212,9 +212,8 @@ class LowerCooperativeMatrix : public Patch, public llvm::PassInfoMixin { void setupElfsPrintfStrings(); llvm::DenseMap m_elfInfos; llvm::SmallVector m_toErase; + llvm::Value *m_debugPrintfBuffer = nullptr; PipelineState *m_pipelineState = nullptr; }; diff --git a/lgc/include/lgc/patch/SystemValues.h b/lgc/include/lgc/patch/SystemValues.h index 069ea1de16..3153bc3882 100644 --- a/lgc/include/lgc/patch/SystemValues.h +++ b/lgc/include/lgc/patch/SystemValues.h @@ -92,6 +92,9 @@ class ShaderSystemValues { // Get pointers to emit counters (GS) std::pair> getEmitCounterPtr(); + // Get pointer to total emit counter (GS) + llvm::Value *getTotalEmitCounterPtr(); + // Get global internal table pointer as pointer to i8. 
llvm::Instruction *getInternalGlobalTablePtr(); @@ -141,6 +144,7 @@ class ShaderSystemValues { llvm::Value *m_tessCoord = nullptr; // Tessellated coordinate (TES) llvm::Value *m_esGsOffsets = nullptr; // ES -> GS offsets (GS in) llvm::SmallVector m_emitCounterPtrs; // Pointers to emit counters (GS) + llvm::Value *m_totalEmitCounterPtr; // Pointer to total emit counter (GS) llvm::SmallVector m_descTablePtrs; // Descriptor table pointers llvm::SmallVector m_shadowDescTablePtrs; // Shadow descriptor table pointers diff --git a/lgc/include/lgc/state/AbiMetadata.h b/lgc/include/lgc/state/AbiMetadata.h index b687fa1576..a11fa5bef9 100644 --- a/lgc/include/lgc/state/AbiMetadata.h +++ b/lgc/include/lgc/state/AbiMetadata.h @@ -34,6 +34,8 @@ */ #pragma once +#include "lgc/CommonDefs.h" +#include "llvm/Support/ErrorHandling.h" #include namespace lgc { @@ -622,8 +624,27 @@ typedef enum SPI_SHADER_EX_FORMAT { } SPI_SHADER_EX_FORMAT; // The names of API shader stages used in PAL metadata, in ShaderStageEnum order. -static const char *const ApiStageNames[] = {".task", ".vertex", ".hull", ".domain", - ".geometry", ".mesh", ".pixel", ".compute"}; +inline const char *shaderStageToApiName(ShaderStageEnum stage) { + switch (stage) { + case ShaderStage::Task: + return ".task"; + case ShaderStage::Vertex: + return ".vertex"; + case ShaderStage::TessControl: + return ".hull"; + case ShaderStage::TessEval: + return ".domain"; + case ShaderStage::Geometry: + return ".geometry"; + case ShaderStage::Mesh: + return ".mesh"; + case ShaderStage::Fragment: + return ".pixel"; + case ShaderStage::Compute: + return ".compute"; + } + llvm::report_fatal_error("No api name for this shader stage"); +} // The names of hardware shader stages used in PAL metadata, in Util::Abi::HardwareStage order. 
static const char *const HwStageNames[static_cast(Util::Abi::HardwareStage::Count)] = {".hs", ".gs", ".vs", diff --git a/lgc/include/lgc/state/PipelineState.h b/lgc/include/lgc/state/PipelineState.h index 9a2ad7bebd..9bc8bed882 100644 --- a/lgc/include/lgc/state/PipelineState.h +++ b/lgc/include/lgc/state/PipelineState.h @@ -41,6 +41,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/Pass.h" #include +#include namespace llvm { @@ -256,9 +257,9 @@ class PipelineState final : public Pipeline { bool hasShaderStage(ShaderStageEnum stage) { return getShaderStageMask().contains(stage); } bool isGraphics(); bool isComputeLibrary() const { return m_computeLibrary; } - ShaderStageEnum getLastVertexProcessingStage() const; - ShaderStageEnum getPrevShaderStage(ShaderStageEnum shaderStage) const; - ShaderStageEnum getNextShaderStage(ShaderStageEnum shaderStage) const; + std::optional getLastVertexProcessingStage() const; + std::optional getPrevShaderStage(ShaderStageEnum shaderStage) const; + std::optional getNextShaderStage(ShaderStageEnum shaderStage) const; // Get client name const char *getClient() const { return m_client.c_str(); } diff --git a/lgc/interface/lgc/Builder.h b/lgc/interface/lgc/Builder.h index f345b7010c..85ef8a388a 100644 --- a/lgc/interface/lgc/Builder.h +++ b/lgc/interface/lgc/Builder.h @@ -200,17 +200,19 @@ class BuilderDefs : public BuilderCommon { // Possible values for dimension argument for image methods. 
enum { - Dim1D = 0, // Coordinate: x - Dim2D = 1, // Coordinate: x, y - Dim3D = 2, // Coordinate: x, y, z - DimCube = 3, // Coordinate: x, y, face - Dim1DArray = 4, // Coordinate: x, slice - Dim2DArray = 5, // Coordinate: x, y, slice - Dim2DMsaa = 6, // Coordinate: x, y, fragid - Dim2DArrayMsaa = 7, // Coordinate: x, y, slice, fragid - DimCubeArray = 8, // Coordinate: x, y, face, slice (despite both SPIR-V and ISA - // combining face and slice into one component) - DimRect = 9, // Coordinate: x, y + Dim1D = 0, // Coordinate: x + Dim2D = 1, // Coordinate: x, y + Dim3D = 2, // Coordinate: x, y, z + DimCube = 3, // Coordinate: x, y, face + Dim1DArray = 4, // Coordinate: x, slice + Dim2DArray = 5, // Coordinate: x, y, slice + Dim2DMsaa = 6, // Coordinate: x, y, fragid + Dim2DArrayMsaa = 7, // Coordinate: x, y, slice, fragid + DimCubeArray = 8, // Coordinate: x, y, face, slice (despite both SPIR-V and ISA + // combining face and slice into one component) + DimRect = 9, // Coordinate: x, y + Dim1DBuffer = 10, // Coordinate: x (identify a texel buffer) + Dim1DArrayBuffer = 11, // Coordinate: x, slice (identify a texel buffer) }; // Get the number of coordinates for the specified dimension argument. 
@@ -219,6 +221,7 @@ class BuilderDefs : public BuilderCommon { static unsigned getImageNumCoords(unsigned dim) { switch (dim) { case Dim1D: + case Dim1DBuffer: return 1; case Dim2D: return 2; @@ -227,6 +230,7 @@ class BuilderDefs : public BuilderCommon { case DimCube: return 3; case Dim1DArray: + case Dim1DArrayBuffer: return 2; case Dim2DArray: return 3; @@ -249,6 +253,7 @@ class BuilderDefs : public BuilderCommon { static unsigned getImageQuerySizeComponentCount(unsigned dim) { switch (dim) { case Dim1D: + case Dim1DBuffer: return 1; case Dim2D: return 2; @@ -257,6 +262,7 @@ class BuilderDefs : public BuilderCommon { case DimCube: return 2; case Dim1DArray: + case Dim1DArrayBuffer: return 2; case Dim2DArray: return 3; @@ -279,6 +285,7 @@ class BuilderDefs : public BuilderCommon { static unsigned getImageDerivativeComponentCount(unsigned dim) { switch (dim) { case Dim1D: + case Dim1DBuffer: return 1; case Dim2D: return 2; @@ -287,6 +294,7 @@ class BuilderDefs : public BuilderCommon { case DimCube: return 3; case Dim1DArray: + case Dim1DArrayBuffer: return 1; case Dim2DArray: return 2; diff --git a/lgc/interface/lgc/CommonDefs.h b/lgc/interface/lgc/CommonDefs.h index 1cace83670..39af5ff976 100644 --- a/lgc/interface/lgc/CommonDefs.h +++ b/lgc/interface/lgc/CommonDefs.h @@ -31,6 +31,7 @@ #pragma once #include "EnumIterator.h" +#include "llvm/ADT/DenseMap.h" #include #include @@ -63,6 +64,34 @@ enum ShaderStage : unsigned { // TODO Temporary definition until ShaderStage is converted to a class enum. using ShaderStageEnum = ShaderStage::ShaderStage; +/// All shader stages +[[maybe_unused]] constexpr const std::array ShaderStages = { + ShaderStage::Compute, ShaderStage::Fragment, ShaderStage::Vertex, + ShaderStage::Geometry, ShaderStage::TessControl, ShaderStage::TessEval, + ShaderStage::Task, ShaderStage::Mesh, ShaderStage::CopyShader, +}; + +/// All graphics shader stages. +/// These are in execution order. 
+[[maybe_unused]] constexpr const std::array ShaderStagesGraphics = { + ShaderStage::Task, ShaderStage::Vertex, ShaderStage::TessControl, ShaderStage::TessEval, + ShaderStage::Geometry, ShaderStage::Mesh, ShaderStage::Fragment, +}; + +/// Graphics and compute shader stages. +/// The graphics stages are in execution order. +[[maybe_unused]] constexpr const std::array ShaderStagesNative = { + ShaderStage::Task, ShaderStage::Vertex, ShaderStage::TessControl, ShaderStage::TessEval, + ShaderStage::Geometry, ShaderStage::Mesh, ShaderStage::Fragment, ShaderStage::Compute, +}; + +/// Graphics and compute shader stages and copy shader. +/// The graphics stages are in execution order. +[[maybe_unused]] constexpr const std::array ShaderStagesNativeCopy = { + ShaderStage::Task, ShaderStage::Vertex, ShaderStage::TessControl, ShaderStage::TessEval, ShaderStage::Geometry, + ShaderStage::Mesh, ShaderStage::Fragment, ShaderStage::Compute, ShaderStage::CopyShader, +}; + class ShaderStageMask { public: constexpr ShaderStageMask() {} @@ -77,6 +106,11 @@ class ShaderStageMask { *this |= ShaderStageMask(stage); }; + template constexpr explicit ShaderStageMask(const std::array &stages) { + for (auto stage : stages) + *this |= ShaderStageMask(stage); + }; + constexpr static ShaderStageMask fromRaw(uint32_t mask) { ShaderStageMask result; result.m_value = mask; @@ -98,6 +132,7 @@ class ShaderStageMask { constexpr bool contains(ShaderStageEnum stage) const; constexpr bool contains_any(std::initializer_list stages) const; + template constexpr bool contains_any(const std::array &stages) const; constexpr bool empty() const { return m_value == 0; } uint32_t m_value = 0; @@ -133,6 +168,10 @@ constexpr bool ShaderStageMask::contains_any(std::initializer_list constexpr bool ShaderStageMask::contains_any(const std::array &stages) const { + return (*this & ShaderStageMask(stages)).m_value != 0; +} + enum AddrSpace { ADDR_SPACE_FLAT = 0, // Flat memory ADDR_SPACE_GLOBAL = 1, // Global memory @@ 
-202,4 +241,13 @@ namespace llvm { // Enable iteration over resource node type with `lgc::enumRange()`. LGC_DEFINE_DEFAULT_ITERABLE_ENUM(lgc::ResourceNodeType); +template <> struct DenseMapInfo { + using T = lgc::ShaderStageEnum; + + static T getEmptyKey() { return static_cast(DenseMapInfo::getEmptyKey()); } + static T getTombstoneKey() { return static_cast(DenseMapInfo::getTombstoneKey()); } + static unsigned getHashValue(const T &Val) { return static_cast(Val); } + static bool isEqual(const T &LHS, const T &RHS) { return LHS == RHS; } +}; + } // namespace llvm diff --git a/lgc/interface/lgc/LgcDialect.td b/lgc/interface/lgc/LgcDialect.td index 7dbfd25551..2f80b43bf9 100644 --- a/lgc/interface/lgc/LgcDialect.td +++ b/lgc/interface/lgc/LgcDialect.td @@ -163,7 +163,7 @@ def LoadStridedBufferDescOp : LgcOp<"load.strided.buffer.desc", [Memory<[]>, Wil } def DebugPrintfOp : LgcOp<"debug.printf", [Memory<[(readwrite InaccessibleMem)]>, WillReturn]> { - let arguments = (ins BufferPointer:$buffer, ConstantPointer:$format, varargs:$args); + let arguments = (ins ImmutableStringAttr:$format, varargs:$args); let results = (outs); let summary = "print a formatted message"; diff --git a/lgc/interface/lgc/Pipeline.h b/lgc/interface/lgc/Pipeline.h index 1c30ffe785..7bdb636f6c 100644 --- a/lgc/interface/lgc/Pipeline.h +++ b/lgc/interface/lgc/Pipeline.h @@ -625,7 +625,7 @@ struct TessellationMode { }; // Kind of GS input primitives. 
-enum class InputPrimitives : unsigned { Points, Lines, LinesAdjacency, Triangles, TrianglesAdjacency }; +enum class InputPrimitives : unsigned { Points, Lines, LinesAdjacency, Triangles, TrianglesAdjacency, Patch }; // Kind of GS/mesh shader output primitives enum class OutputPrimitives : unsigned { @@ -645,6 +645,7 @@ struct GeometryShaderMode { OutputPrimitives outputPrimitive; // Kind of output primitives unsigned invocations; // Number of times to invoke shader for each input primitive unsigned outputVertices; // Max number of vertices the shader will emit in one invocation + unsigned controlPoints; // Number of control points when the input primitive is a patch unsigned robustGsEmits; // robust buffer access }; diff --git a/llpc/context/GfxRuntimeContext.h b/lgc/interface/lgc/RuntimeContext.h similarity index 91% rename from llpc/context/GfxRuntimeContext.h rename to lgc/interface/lgc/RuntimeContext.h index 83639d10d1..fcf50e4778 100644 --- a/llpc/context/GfxRuntimeContext.h +++ b/lgc/interface/lgc/RuntimeContext.h @@ -24,8 +24,8 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file GfxRuntimeContext.h - * @brief LLVMContext extension that stores a GfxRuntime library module + * @file RuntimeContext.h + * @brief LLVMContext extension that stores a Runtime library module *********************************************************************************************************************** */ #pragma once @@ -40,16 +40,16 @@ class Module; namespace lgc { // This extension can be attached to an LLVMContext and queried via the -// GfxRuntimeContext::get method inherited from the base class. +// RuntimeContext::get method inherited from the base class. 
// -// Compiler drivers (like LLPC) are expected to set theModule to the GfxRuntime +// Compiler drivers (like LLPC) are expected to set theModule to the Runtime // library, so that advanced blend pass can cross-module inline // functions implemented there. + class GfxRuntimeContext : public llvm_dialects::ContextExtensionImpl { public: explicit GfxRuntimeContext(llvm::LLVMContext &) {} - ~GfxRuntimeContext(); - + ~GfxRuntimeContext() = default; static Key theKey; std::unique_ptr theModule; }; diff --git a/lgc/patch/ConfigBuilderBase.cpp b/lgc/patch/ConfigBuilderBase.cpp index 2d2d57147b..9302cc39df 100644 --- a/lgc/patch/ConfigBuilderBase.cpp +++ b/lgc/patch/ConfigBuilderBase.cpp @@ -111,10 +111,11 @@ void ConfigBuilderBase::addApiHwShaderMapping(ShaderStageEnum apiStage, unsigned // Get the MsgPack map node for the specified API shader in the ".shaders" map // // @param apiStage : API shader stage -msgpack::MapDocNode ConfigBuilderBase::getApiShaderNode(unsigned apiStage) { +msgpack::MapDocNode ConfigBuilderBase::getApiShaderNode(ShaderStageEnum apiStage) { if (m_apiShaderNodes[apiStage].isEmpty()) { m_apiShaderNodes[apiStage] = - m_pipelineNode[Util::Abi::PipelineMetadataKey::Shaders].getMap(true)[ApiStageNames[apiStage]].getMap(true); + m_pipelineNode[Util::Abi::PipelineMetadataKey::Shaders].getMap(true)[shaderStageToApiName(apiStage)].getMap( + true); } return m_apiShaderNodes[apiStage]; } @@ -139,7 +140,7 @@ msgpack::MapDocNode ConfigBuilderBase::getHwShaderNode(Util::Abi::HardwareStage // @param apiStage : API shader stage unsigned ConfigBuilderBase::setShaderHash(ShaderStageEnum apiStage) { const ShaderOptions &shaderOptions = m_pipelineState->getShaderOptions(apiStage); - auto hashNode = getApiShaderNode(unsigned(apiStage))[Util::Abi::ShaderMetadataKey::ApiShaderHash].getArray(true); + auto hashNode = getApiShaderNode(apiStage)[Util::Abi::ShaderMetadataKey::ApiShaderHash].getArray(true); hashNode[0] = shaderOptions.hash[0]; hashNode[1] = 
shaderOptions.hash[1]; return shaderOptions.hash[0] >> 32 ^ shaderOptions.hash[0] ^ shaderOptions.hash[1] >> 32 ^ shaderOptions.hash[1]; @@ -314,7 +315,7 @@ void ConfigBuilderBase::setThreadgroupDimensions(llvm::ArrayRef values } // ===================================================================================================================== -// Set stream-out vertex strides (GFX11+) +// Set stream-out vertex strides // // @param values : Values to set void ConfigBuilderBase::setStreamOutVertexStrides(ArrayRef values) { diff --git a/lgc/patch/ConfigBuilderBase.h b/lgc/patch/ConfigBuilderBase.h index 84f030e6a0..31c6c57acb 100644 --- a/lgc/patch/ConfigBuilderBase.h +++ b/lgc/patch/ConfigBuilderBase.h @@ -126,11 +126,11 @@ class ConfigBuilderBase { private: // Get the MsgPack map node for the specified API shader in the ".shaders" map - llvm::msgpack::MapDocNode getApiShaderNode(unsigned apiStage); + llvm::msgpack::MapDocNode getApiShaderNode(ShaderStageEnum apiStage); llvm::msgpack::Document *m_document; // The MsgPack document llvm::msgpack::MapDocNode m_pipelineNode; // MsgPack map node for amdpal.pipelines[0] - llvm::msgpack::MapDocNode m_apiShaderNodes[ShaderStage::NativeStageCount]; + llvm::DenseMap m_apiShaderNodes; // MsgPack map node for each API shader's node in // ".shaders" llvm::msgpack::MapDocNode m_hwShaderNodes[unsigned(Util::Abi::HardwareStage::Count)]; diff --git a/lgc/patch/LowerCooperativeMatrix.cpp b/lgc/patch/LowerCooperativeMatrix.cpp index 5dc9181dcb..f5d20ea013 100644 --- a/lgc/patch/LowerCooperativeMatrix.cpp +++ b/lgc/patch/LowerCooperativeMatrix.cpp @@ -1060,6 +1060,7 @@ Value *LowerCooperativeMatrix::cooperativeMatrixReshapeBetween16bitAnd32bitOnAcc } else { resultValue = builder.CreateBitCast(resultValue, FixedVectorType::get(builder.getFloatTy(), 4)); // 1st case:after convert + resultValue = builder.CreateShuffleVector(resultValue, {0, 1, 2, 3, -1, -1, -1, -1}); } return resultValue; } @@ -1542,8 +1543,8 @@ void 
LowerCooperativeMatrix::visitCooperativeMatrixMulAddOp(CooperativeMatrixMul mulAB1 = createDotProductFp16Fp16(rowData1, colData, accumulator1, isSatOrOpsel, instName, &muladd); mulAB2 = createDotProductFp16Fp16(rowData2, colData, accumulator2, isSatOrOpsel, instName, &muladd); } else { - mulAB1 = createDotProductInt16Int16(rowData1, colData, accumulator1, flags, isSatOrOpsel, instName, &muladd); - mulAB2 = createDotProductInt16Int16(rowData2, colData, accumulator2, flags, isSatOrOpsel, instName, &muladd); + mulAB1 = createDotProductInt(rowData1, colData, accumulator1, flags, isSatOrOpsel, instName, &muladd); + mulAB2 = createDotProductInt(rowData2, colData, accumulator2, flags, isSatOrOpsel, instName, &muladd); } dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB1, accIdx); dotProductValue = builder.CreateInsertElement(dotProductValue, mulAB2, accIdx + 1); @@ -1575,6 +1576,8 @@ Value *LowerCooperativeMatrix::createDotProductFp16Fp32(Value *const vector1, Va BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); + // Dot instructions are not available on gfx1010 + const bool emulateDot = m_gfxIp.isGfx(10, 1) && m_gfxIp.stepping == 0; const unsigned compCount = cast(vector1->getType())->getNumElements(); Value *scalar = initAccumulator; auto intrinsicDot = Intrinsic::amdgcn_fdot2; @@ -1583,8 +1586,18 @@ Value *LowerCooperativeMatrix::createDotProductFp16Fp32(Value *const vector1, Va input1 = builder.CreateBitCast(input1, FixedVectorType::get(builder.getHalfTy(), 2)); Value *input2 = builder.CreateExtractElement(vector2, i); input2 = builder.CreateBitCast(input2, FixedVectorType::get(builder.getHalfTy(), 2)); - scalar = - builder.CreateIntrinsic(intrinsicDot, {}, {input1, input2, scalar, builder.getInt1(isSat)}, nullptr, instName); + if (emulateDot) { + Value *input1Fp32 = builder.CreateFPCast(input1, FixedVectorType::get(builder.getFloatTy(), 2)); + Value *input2Fp32 = builder.CreateFPCast(input2, 
FixedVectorType::get(builder.getFloatTy(), 2)); + for (unsigned j = 0; j < 2; ++j) { + Value *lhs = builder.CreateExtractElement(input1Fp32, j); + Value *rhs = builder.CreateExtractElement(input2Fp32, j); + scalar = builder.CreateIntrinsic(Intrinsic::fmuladd, lhs->getType(), {lhs, rhs, scalar}); + } + } else { + scalar = builder.CreateIntrinsic(intrinsicDot, {}, {input1, input2, scalar, builder.getInt1(isSat)}, nullptr, + instName); + } } scalar->setName(instName); return scalar; @@ -1638,6 +1651,8 @@ Value *LowerCooperativeMatrix::createDotProductInt8Int32(Value *vector1, Value * BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); + // Dot instructions are not available on gfx1010 + const bool emulateDot = m_gfxIp.isGfx(10, 1) && m_gfxIp.stepping == 0; const bool isSigned = (flags & lgc::Builder::FirstVectorSigned); auto intrinsicDot = isSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4; @@ -1646,8 +1661,14 @@ Value *LowerCooperativeMatrix::createDotProductInt8Int32(Value *vector1, Value * for (unsigned i = 0; i < compCount; ++i) { Value *input1 = builder.CreateExtractElement(vector1, i); Value *input2 = builder.CreateExtractElement(vector2, i); - scalar = - builder.CreateIntrinsic(intrinsicDot, {}, {input1, input2, scalar, builder.getInt1(false)}, nullptr, instName); + if (emulateDot) { + input1 = builder.CreateBitCast(input1, FixedVectorType::get(builder.getInt8Ty(), 4)); + input2 = builder.CreateBitCast(input2, FixedVectorType::get(builder.getInt8Ty(), 4)); + scalar = createDotProductInt(input1, input2, scalar, flags, isSat, instName, insertPos); + } else { + scalar = builder.CreateIntrinsic(intrinsicDot, {}, {input1, input2, scalar, builder.getInt1(false)}, nullptr, + instName); + } } // Always use sadd_sat here as uint32@C is not supported. 
@@ -1677,6 +1698,8 @@ Value *LowerCooperativeMatrix::createDotProductInt16Int32(Value *vector1, Value BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); + // Dot instructions are not available on gfx1010 + const bool emulateDot = m_gfxIp.isGfx(10, 1) && m_gfxIp.stepping == 0; const bool isSigned = (flags & lgc::Builder::FirstVectorSigned); auto intrinsicDot = isSigned ? Intrinsic::amdgcn_sdot2 : Intrinsic::amdgcn_udot2; @@ -1687,8 +1710,12 @@ Value *LowerCooperativeMatrix::createDotProductInt16Int32(Value *vector1, Value input1 = builder.CreateBitCast(input1, FixedVectorType::get(builder.getInt16Ty(), 2)); Value *input2 = builder.CreateExtractElement(vector2, i); input2 = builder.CreateBitCast(input2, FixedVectorType::get(builder.getInt16Ty(), 2)); - scalar = - builder.CreateIntrinsic(intrinsicDot, {}, {input1, input2, scalar, builder.getInt1(isSat)}, nullptr, instName); + if (emulateDot) { + scalar = createDotProductInt(input1, input2, scalar, flags, isSat, instName, insertPos); + } else { + scalar = builder.CreateIntrinsic(intrinsicDot, {}, {input1, input2, scalar, builder.getInt1(isSat)}, nullptr, + instName); + } } scalar->setName(instName); return scalar; @@ -1704,9 +1731,8 @@ Value *LowerCooperativeMatrix::createDotProductInt16Int32(Value *vector1, Value // @param isSat: SaturatingAccumulation for calculation // @param instName : Name to give instruction(s) // @param insertPos : Where to insert the instruction -Value *LowerCooperativeMatrix::createDotProductInt16Int16(Value *vector1, Value *vector2, Value *accumulator, - unsigned flags, bool isSat, const Twine &instName, - Instruction *insertPos) { +Value *LowerCooperativeMatrix::createDotProductInt(Value *vector1, Value *vector2, Value *accumulator, unsigned flags, + bool isSat, const Twine &instName, Instruction *insertPos) { BuilderBase builder(*m_context); builder.SetInsertPoint(insertPos); Type *inputTy = vector1->getType(); @@ -1720,9 +1746,13 @@ Value 
*LowerCooperativeMatrix::createDotProductInt16Int16(Value *vector1, Value // as unsigned. const bool isMixed = (flags == lgc::Builder::FirstVectorSigned); - Type *targetTy = builder.getInt64Ty(); + const auto outputSizeInBits = outputTy->getScalarSizeInBits(); + const auto compSizeInBits = inputTy->getScalarSizeInBits(); + Type *targetTy = compSizeInBits * 2 >= outputSizeInBits ? builder.getIntNTy(outputSizeInBits * 2) : outputTy; + const auto targetSizeInBits = targetTy->getScalarSizeInBits(); + assert(targetSizeInBits <= 64); // Emulate dot product with no HW support cases - Value *scalar = builder.getInt64(0); + Value *scalar = builder.getIntN(targetSizeInBits, 0); for (unsigned elemIdx = 0; elemIdx < compCount; ++elemIdx) { Value *elem1 = builder.CreateExtractElement(vector1, elemIdx); elem1 = isSigned ? builder.CreateSExt(elem1, targetTy) : builder.CreateZExt(elem1, targetTy); @@ -1732,28 +1762,27 @@ Value *LowerCooperativeMatrix::createDotProductInt16Int16(Value *vector1, Value scalar = builder.CreateAdd(product, scalar); } - scalar = builder.CreateTrunc(scalar, builder.getInt32Ty()); - accumulator = builder.CreateTrunc(accumulator, builder.getInt32Ty()); + scalar = builder.CreateTrunc(scalar, outputTy); + accumulator = builder.CreateTrunc(accumulator, outputTy); Intrinsic::ID addIntrinsic = isSigned ? 
Intrinsic::sadd_sat : Intrinsic::uadd_sat; scalar = builder.CreateBinaryIntrinsic(addIntrinsic, scalar, accumulator, nullptr, instName); - const unsigned bitWidth = outputTy->getScalarSizeInBits(); - auto unsignedMax = (2ULL << (bitWidth - 1)) - 1; + auto unsignedMax = (2ULL << (targetSizeInBits - 1)) - 1; auto signedMax = unsignedMax >> 1; auto signedMin = -1ULL - signedMax; Value *minimum = nullptr, *maximum = nullptr; Value *isUnderflow = nullptr, *isOverflow = nullptr; if (isSigned) { - scalar = builder.CreateSExt(scalar, builder.getInt64Ty()); - minimum = ConstantInt::getSigned(builder.getInt64Ty(), signedMin); - maximum = ConstantInt::getSigned(builder.getInt64Ty(), signedMax); + scalar = builder.CreateSExt(scalar, targetTy); + minimum = ConstantInt::getSigned(targetTy, signedMin); + maximum = ConstantInt::getSigned(targetTy, signedMax); isUnderflow = builder.CreateICmpSLT(scalar, minimum); isOverflow = builder.CreateICmpSGT(scalar, maximum); } else { - scalar = builder.CreateZExt(scalar, builder.getInt64Ty()); - minimum = builder.getInt64(0); - maximum = builder.getInt64(unsignedMax); + scalar = builder.CreateZExt(scalar, targetTy); + minimum = builder.getIntN(targetSizeInBits, 0); + maximum = builder.getIntN(targetSizeInBits, unsignedMax); isUnderflow = builder.CreateICmpULT(scalar, minimum); isOverflow = builder.CreateICmpUGT(scalar, maximum); } diff --git a/lgc/patch/LowerDebugPrintf.cpp b/lgc/patch/LowerDebugPrintf.cpp index 7de0ef8752..3ccc0d5862 100644 --- a/lgc/patch/LowerDebugPrintf.cpp +++ b/lgc/patch/LowerDebugPrintf.cpp @@ -30,20 +30,24 @@ */ #include "lgc/patch/LowerDebugPrintf.h" #include "lgc/LgcDialect.h" +#include "lgc/builder/BuilderImpl.h" #include "lgc/patch/Patch.h" #include "lgc/state/PalMetadata.h" #include "lgc/state/PipelineState.h" #include "llvm-dialects/Dialect/Visitor.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SetVector.h" #include "llvm/BinaryFormat/MsgPackDocument.h" #include "llvm/InitializePasses.h" #include 
"llvm/Support/Debug.h" +#include #define DEBUG_TYPE "lower-debug-printf" using namespace llvm; using namespace lgc; +constexpr unsigned PrintfBufferBindingId = 6; namespace lgc { // ===================================================================================================================== @@ -57,10 +61,40 @@ PreservedAnalyses LowerDebugPrintf::run(Module &module, ModuleAnalysisManager &a PipelineState *pipelineState = analysisManager.getResult(module).getPipelineState(); m_pipelineState = pipelineState; - static const auto visitor = - llvm_dialects::VisitorBuilder().add(&LowerDebugPrintf::visitDebugPrintf).build(); + // Find the function which contains DebugPrintf dialect + typedef SmallSetVector FuncSet; + FuncSet printfFuncs; + static const auto debugPrintfFuncsVisitor = + llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add([](FuncSet &pfunc, auto &inst) { pfunc.insert(inst.getFunction()); }) + .build(); + debugPrintfFuncsVisitor.visit(printfFuncs, module); - visitor.visit(*this, module); + if (printfFuncs.empty()) + return PreservedAnalyses::all(); + + bool hasPrintfDesc = + pipelineState + ->findResourceNode(ResourceNodeType::DescriptorBuffer, InternalDescriptorSetId, PrintfBufferBindingId) + .second != nullptr; + + static const auto lowerDebugfPrintOpVisitor = llvm_dialects::VisitorBuilder() + .setStrategy(llvm_dialects::VisitorStrategy::ByFunctionDeclaration) + .add(&LowerDebugPrintf::visitDebugPrintf) + .build(); + + BuilderImpl builder(m_pipelineState); + for (auto func : printfFuncs) { + // Create printbuffer Descriptor at the beginning of the function which contains DebugPrintf dialect ops + builder.SetInsertPointPastAllocas(func); + m_debugPrintfBuffer = hasPrintfDesc + ? 
m_debugPrintfBuffer = builder.create(builder.CreateBufferDesc( + InternalDescriptorSetId, PrintfBufferBindingId, builder.getInt32(0), 2)) + : nullptr; + + lowerDebugfPrintOpVisitor.visit(*this, *func); + } for (auto inst : m_toErase) inst->eraseFromParent(); @@ -79,10 +113,11 @@ PreservedAnalyses LowerDebugPrintf::run(Module &module, ModuleAnalysisManager &a void LowerDebugPrintf::visitDebugPrintf(DebugPrintfOp &op) { m_toErase.push_back(&op); - Value *debugPrintfBuffer = op.getBuffer(); - if (isa(debugPrintfBuffer)) + if (!m_debugPrintfBuffer) return; + Value *debugPrintfBuffer = m_debugPrintfBuffer; + BuilderBase builder(&op); // Printf output variables in DWORDs @@ -94,8 +129,7 @@ void LowerDebugPrintf::visitDebugPrintf(DebugPrintfOp &op) { getDwordValues(var, printArgs, bit64Vector, builder); } - GlobalVariable *globalStr = cast(op.getFormat()); - StringRef strDebugStr = (cast(globalStr->getInitializer()))->getAsString(); + StringRef strDebugStr = op.getFormat(); uint64_t hash = hash_value(strDebugStr); diff --git a/lgc/patch/MeshTaskShader.cpp b/lgc/patch/MeshTaskShader.cpp index c1b259d425..6e47c76187 100644 --- a/lgc/patch/MeshTaskShader.cpp +++ b/lgc/patch/MeshTaskShader.cpp @@ -998,13 +998,14 @@ void MeshTaskShader::lowerEmitMeshTasks(EmitMeshTasksOp &emitMeshTasksOp) { auto emitMeshTasksBlock = checkEmitMeshTasksBlock->splitBasicBlock(emitMeshTasksCall, ".emitMeshTasks"); auto endEmitMeshTasksBlock = emitMeshTasksBlock->splitBasicBlock(emitMeshTasksCall, ".endEmitMeshTasks"); + SyncScope::ID agentScope = m_builder.getContext().getOrInsertSyncScopeID("agent"); // Modify ".checkEmitMeshTasks" block { m_builder.SetInsertPoint(checkEmitMeshTasksBlock->getTerminator()); if (m_accessTaskPayload) { // Make sure the task payload read/write access is completed - m_builder.CreateFence(AtomicOrdering::Release, SyncScope::System); + m_builder.CreateFence(AtomicOrdering::Release, agentScope); createBarrier(); } @@ -1043,7 +1044,7 @@ void 
MeshTaskShader::lowerEmitMeshTasks(EmitMeshTasksOp &emitMeshTasksOp) { valueToAdd = m_builder.CreateBitCast(valueToAdd, m_builder.getInt64Ty()); m_builder.CreateAtomicRMW(AtomicRMWInst::Add, meshPipeStatsBufEntryPtr, valueToAdd, MaybeAlign(), - AtomicOrdering::Monotonic, SyncScope::System); + AtomicOrdering::Monotonic, agentScope); } // @@ -2082,7 +2083,7 @@ void MeshTaskShader::exportVertex() { if (waAtmPrecedesPos) { // Before the first export call of vertex position data, add s_wait_vscnt 0 to make sure the completion of all // attributes being written to the attribute ring buffer - m_builder.CreateFence(AtomicOrdering::Release, SyncScope::System); + m_builder.CreateFence(AtomicOrdering::Release, m_builder.getContext().getOrInsertSyncScopeID("agent")); doExport(ExportKind::Pos, posExports); } @@ -2101,6 +2102,7 @@ void MeshTaskShader::collectMeshStatsInfo(Function *entryPoint, Value *numMeshPr const uint64_t numMeshThreads = meshMode.workgroupSizeX * meshMode.workgroupSizeY * meshMode.workgroupSizeZ; Value *meshPipeStatsBufPtr = m_pipelineSysValues.get(entryPoint)->getMeshPipeStatsBufPtr(); + SyncScope::ID agentScope = m_builder.getContext().getOrInsertSyncScopeID("agent"); // // Record numMeshThreads @@ -2122,7 +2124,7 @@ void MeshTaskShader::collectMeshStatsInfo(Function *entryPoint, Value *numMeshPr valueToAdd = m_builder.CreateBitCast(valueToAdd, m_builder.getInt64Ty()); m_builder.CreateAtomicRMW(AtomicRMWInst::Add, meshPipeStatsBufEntryPtr, valueToAdd, MaybeAlign(), - AtomicOrdering::Monotonic, SyncScope::System); + AtomicOrdering::Monotonic, agentScope); } // @@ -2147,7 +2149,7 @@ void MeshTaskShader::collectMeshStatsInfo(Function *entryPoint, Value *numMeshPr valueToAdd = m_builder.CreateBitCast(valueToAdd, m_builder.getInt64Ty()); m_builder.CreateAtomicRMW(AtomicRMWInst::Add, meshPipeStatsBufEntryPtr, valueToAdd, MaybeAlign(), - AtomicOrdering::Monotonic, SyncScope::System); + AtomicOrdering::Monotonic, agentScope); } } diff --git 
a/lgc/patch/NggPrimShader.cpp b/lgc/patch/NggPrimShader.cpp index bca8196fef..24db846493 100644 --- a/lgc/patch/NggPrimShader.cpp +++ b/lgc/patch/NggPrimShader.cpp @@ -4068,12 +4068,13 @@ void NggPrimShader::writeGsOutput(Value *output, unsigned location, unsigned com const unsigned attribOffset = (location * 4) + component; auto ldsOffset = m_builder.CreateAdd(vertexOffset, m_builder.getInt32(attribOffset)); + IRBuilder<>::InsertPointGuard guard(m_builder); + + // Skip GS-VS ring write if the emit is invalid if (geometryMode.robustGsEmits) { - // skip the lds write by writing to a dummy offset. - // ldsOffset = (totalEmitVerts >= outputVertices) ? InvalidValue : ldsOffset - auto dummyOffset = m_builder.getInt32(0x80000000); - auto outOfRange = m_builder.CreateICmpUGE(totalEmitVerts, m_builder.getInt32(geometryMode.outputVertices)); - ldsOffset = m_builder.CreateSelect(outOfRange, dummyOffset, ldsOffset); + // validEmit = totalEmitVerts < outputVertices + auto validEmit = m_builder.CreateICmpULT(totalEmitVerts, m_builder.getInt32(geometryMode.outputVertices)); + m_builder.CreateIf(validEmit, false); } writeValueToLds(output, ldsOffset); @@ -4246,7 +4247,7 @@ Function *NggPrimShader::createGsEmitHandler() { totalEmitVerts = m_builder.CreateLoad(m_builder.getInt32Ty(), totalEmitVertsPtr); // totalEmitVerts++ totalEmitVerts = m_builder.CreateAdd(totalEmitVerts, m_builder.getInt32(1)); - // outVerts = (totalEmitVerts >= outputVertices) ? 0 : outVerts + // outVerts = (totalEmitVerts > outputVertices) ? 
0 : outVerts Value *outOfRange = m_builder.CreateICmpUGT(totalEmitVerts, m_builder.getInt32(geometryMode.outputVertices)); outVerts = m_builder.CreateSelect(outOfRange, m_builder.getInt32(0), outVerts); } @@ -6253,7 +6254,7 @@ void NggPrimShader::processVertexAttribExport(Function *&target) { // Before the first export call, add s_wait_vscnt 0 to make sure the completion of all attributes being written // to the attribute ring buffer m_builder.SetInsertPoint(exportCalls[0]); - m_builder.CreateFence(AtomicOrdering::Release, SyncScope::System); + m_builder.CreateFence(AtomicOrdering::Release, m_builder.getContext().getOrInsertSyncScopeID("agent")); } // Remove calls diff --git a/lgc/patch/NggPrimShader.h b/lgc/patch/NggPrimShader.h index 7332661439..71a8e27982 100644 --- a/lgc/patch/NggPrimShader.h +++ b/lgc/patch/NggPrimShader.h @@ -32,6 +32,7 @@ #include "lgc/state/PipelineState.h" #include "lgc/state/TargetInfo.h" +#include "lgc/util/BuilderBase.h" #include "llvm/IR/Module.h" namespace lgc { @@ -419,7 +420,7 @@ class NggPrimShader { VertexCullInfoOffsets m_vertCullInfoOffsets; // A collection of offsets within an item of vertex cull info StreamOutControlCbOffsets m_streamOutControlCbOffsets; // A collection of offsets within stream-out control buffer - llvm::IRBuilder<> m_builder; // LLVM IR builder + BuilderBase m_builder; // LLVM IR builder llvm::Constant *m_lds = nullptr; // Global variable to model primitive shader LDS PrimShaderLdsLayout m_ldsLayout; // Primitive shader LDS layout diff --git a/lgc/patch/Patch.cpp b/lgc/patch/Patch.cpp index 1c94ed3a4c..9e759fec1e 100644 --- a/lgc/patch/Patch.cpp +++ b/lgc/patch/Patch.cpp @@ -180,7 +180,6 @@ void Patch::addPasses(PipelineState *pipelineState, lgc::PassManager &passMgr, T } passMgr.addPass(IPSCCPPass()); - passMgr.addPass(LowerDebugPrintf()); passMgr.addPass(createModuleToFunctionPassAdaptor(CombineCooperativeMatrix())); // Lower the cooperative matrix @@ -201,6 +200,7 @@ void Patch::addPasses(PipelineState 
*pipelineState, lgc::PassManager &passMgr, T passMgr.addPass(PatchCopyShader()); passMgr.addPass(LowerVertexFetch()); passMgr.addPass(LowerFragColorExport()); + passMgr.addPass(LowerDebugPrintf()); passMgr.addPass(LowerDesc()); passMgr.addPass(PatchEntryPointMutate()); passMgr.addPass(PatchInitializeWorkgroupMemory()); diff --git a/lgc/patch/PatchEntryPointMutate.cpp b/lgc/patch/PatchEntryPointMutate.cpp index bfc06831de..3deff7bf30 100644 --- a/lgc/patch/PatchEntryPointMutate.cpp +++ b/lgc/patch/PatchEntryPointMutate.cpp @@ -55,6 +55,7 @@ #include "lgc/patch/PatchEntryPointMutate.h" #include "ShaderMerger.h" +#include "compilerutils/CompilerUtils.h" #include "lgc/LgcContext.h" #include "lgc/LgcCpsDialect.h" #include "lgc/LgcDialect.h" @@ -847,7 +848,8 @@ Function *PatchEntryPointMutate::lowerCpsFunction(Function *func, ArrayRefgetType()); vsp = builder.CreateConstInBoundsGEP1_32(builder.getInt8Ty(), vsp, -alignTo(stateSize, ContinuationStackAlignment)); - Value *newState = builder.CreateLoad(state->getType(), vsp, "cps.state"); + auto *newState = builder.CreateLoad(state->getType(), vsp, "cps.state"); + CompilerUtils::setIsLastUseLoad(*newState); state->replaceAllUsesWith(newState); } vsp = builder.CreatePtrToInt(vsp, builder.getInt32Ty()); @@ -1101,7 +1103,7 @@ void PatchEntryPointMutate::gatherUserDataUsage(Module *module) { // offsets to calculate numbers of written primitives/dwords and update the counters. auto lastVertexStage = auto lastVertexStage = m_pipelineState->getLastVertexProcessingStage(); lastVertexStage = lastVertexStage == ShaderStage::CopyShader ? 
ShaderStage::Geometry : lastVertexStage; - getUserDataUsage(lastVertexStage)->usesStreamOutTable = true; + getUserDataUsage(lastVertexStage.value())->usesStreamOutTable = true; } } } diff --git a/lgc/patch/PatchInOutImportExport.cpp b/lgc/patch/PatchInOutImportExport.cpp index fb16894137..09027bf9fb 100644 --- a/lgc/patch/PatchInOutImportExport.cpp +++ b/lgc/patch/PatchInOutImportExport.cpp @@ -1082,6 +1082,14 @@ void PatchInOutImportExport::visitCallInst(CallInst &callInst) { Value *emitCounter = builder.CreateLoad(emitCounterTy, emitCounterPtr); emitCounter = builder.CreateAdd(emitCounter, builder.getInt32(1)); builder.CreateStore(emitCounter, emitCounterPtr); + + // Increment total emit vertex counter + if (m_pipelineState->getShaderModes()->getGeometryShaderMode().robustGsEmits) { + auto totalEmitCounterPtr = m_pipelineSysValues.get(m_entryPoint)->getTotalEmitCounterPtr(); + Value *totalEmitCounter = builder.CreateLoad(builder.getInt32Ty(), totalEmitCounterPtr); + totalEmitCounter = builder.CreateAdd(totalEmitCounter, builder.getInt32(1)); + builder.CreateStore(totalEmitCounter, totalEmitCounterPtr); + } } } } @@ -1101,7 +1109,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { // Whether this shader stage has to use "exp" instructions to export outputs const bool useExpInst = ((m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::CopyShader) && - (nextStage == ShaderStage::Invalid || nextStage == ShaderStage::Fragment)); + (!nextStage || nextStage == ShaderStage::Fragment)); BuilderBase builder(&retInst); @@ -1329,7 +1337,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { } // NOTE: We have to export gl_ClipDistance[] or gl_CullDistancep[] via generic outputs as well. 
- assert(nextStage == ShaderStage::Invalid || nextStage == ShaderStage::Fragment); + assert(!nextStage || nextStage == ShaderStage::Fragment); bool hasClipCullExport = true; if (nextStage == ShaderStage::Fragment) { @@ -1385,7 +1393,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { bool hasPrimitiveIdExport = false; if (nextStage == ShaderStage::Fragment) { hasPrimitiveIdExport = nextBuiltInUsage.primitiveId; - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { if (m_shaderStage == ShaderStage::CopyShader) { hasPrimitiveIdExport = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->builtInUsage.gs.primitiveId; @@ -1466,7 +1474,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { bool hasViewportIndexExport = true; if (nextStage == ShaderStage::Fragment) { hasViewportIndexExport = nextBuiltInUsage.viewportIndex; - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { hasViewportIndexExport = false; } @@ -1485,7 +1493,7 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { bool hasLayerExport = true; if (nextStage == ShaderStage::Fragment) { hasLayerExport = nextBuiltInUsage.layer; - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { hasLayerExport = false; } @@ -1533,8 +1541,8 @@ void PatchInOutImportExport::visitReturnInst(ReturnInst &retInst) { } else if (m_shaderStage == ShaderStage::Geometry) { // NOTE: Per programming guide, we should do a "s_waitcnt 0,0,0 + s_waitcnt_vscnt 0" before issuing a "done", so // we use fence release to generate s_waitcnt vmcnt lgkmcnt/s_waitcnt_vscnt before s_sendmsg(MSG_GS_DONE) - SyncScope::ID scope = - m_pipelineState->isGsOnChip() ? m_context->getOrInsertSyncScopeID("workgroup") : SyncScope::System; + StringRef scopeName = m_pipelineState->isGsOnChip() ? 
"workgroup" : "agent"; + SyncScope::ID scope = m_context->getOrInsertSyncScopeID(scopeName); builder.CreateFence(AtomicOrdering::Release, scope); auto &entryArgIdxs = m_pipelineState->getShaderInterfaceData(ShaderStage::Geometry)->entryArgIdxs.gs; @@ -4001,6 +4009,18 @@ void PatchInOutImportExport::storeValueToGsVsRing(Value *storeValue, unsigned lo auto ringOffset = calcGsVsRingOffsetForOutput(location, compIdx, streamId, emitCounter, gsVsOffset, builder); + IRBuilder<>::InsertPointGuard guard(builder); + + // Skip GS-VS ring write if the emit is invalid + const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); + if (geometryMode.robustGsEmits) { + auto totalEmitCounterPtr = m_pipelineSysValues.get(m_entryPoint)->getTotalEmitCounterPtr(); + auto totalEmitCounter = builder.CreateLoad(builder.getInt32Ty(), totalEmitCounterPtr); + // validEmit = totalEmitCounter < outputVertices + auto validEmit = builder.CreateICmpULT(totalEmitCounter, builder.getInt32(geometryMode.outputVertices)); + builder.CreateIf(validEmit, false); + } + if (m_pipelineState->isGsOnChip()) { auto lds = Patch::getLdsVariable(m_pipelineState, m_entryPoint); Value *storePtr = builder.CreateGEP(builder.getInt32Ty(), lds, ringOffset); @@ -4048,7 +4068,6 @@ Value *PatchInOutImportExport::calcEsGsRingOffsetForOutput(unsigned location, un BuilderBase &builder) { // ES -> GS ring is always on-chip on GFX10+ // ringOffset = esGsOffset + threadId * esGsRingItemSize + location * 4 + compIdx - assert(m_pipelineState->hasShaderStage(ShaderStage::Geometry)); const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; @@ -4071,12 +4090,37 @@ Value *PatchInOutImportExport::calcEsGsRingOffsetForOutput(unsigned location, un // @param builder : the builder to use Value *PatchInOutImportExport::calcEsGsRingOffsetForInput(unsigned location, unsigned compIdx, Value *vertexIdx, BuilderBase &builder) { + // ES -> GS ring is always 
on-chip on GFX10+ + assert(m_pipelineState->hasShaderStage(ShaderStage::Geometry)); + const auto &calcFactor = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry)->inOutUsage.gs.calcFactor; + auto esGsOffsets = m_pipelineSysValues.get(m_entryPoint)->getEsGsOffsets(); + const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); - // ES -> GS ring is always on-chip on GFX10+ - Value *vertexOffset = builder.CreateExtractElement(esGsOffsets, vertexIdx); + Value *vertexOffset = nullptr; + if (geometryMode.inputPrimitive == InputPrimitives::Patch) { + assert(geometryMode.controlPoints > 0); // Must have control points + + // NOTE: If the input primitive is a patch, the calculation of vertex offset is different from other input primitive + // types as follow: + // + // vertexOffset = esGsOffset0 + vertexIdx * esGsRingItemSize + // + // The esGsOffset0 is the starting offset of control points for each patch with such HW layout: + // + // +-----------------+-----------------+-----+-------------------+ + // | Control Point 0 | Control Point 1 | ... 
| Control Point N-1 | + // +-----------------+-----------------+-----+-------------------+ + // |<-------------------------- Patch -------------------------->| + // + vertexOffset = builder.CreateMul(vertexIdx, builder.getInt32(calcFactor.esGsRingItemSize)); + vertexOffset = builder.CreateAdd(builder.CreateExtractElement(esGsOffsets, static_cast(0)), vertexOffset); + } else { + // vertexOffset = esGsOffsets[vertexIdx] (vertexIdx < 6) + vertexOffset = builder.CreateExtractElement(esGsOffsets, vertexIdx); + } - // ringOffset = vertexOffset[N] + (location * 4 + compIdx); + // ringOffset = vertexOffset + (location * 4 + compIdx); Value *ringOffset = builder.CreateAdd(vertexOffset, builder.getInt32(location * 4 + compIdx)); return ringOffset; } @@ -4620,7 +4664,7 @@ void PatchInOutImportExport::addExportInstForGenericOutput(Value *output, unsign const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); const bool useExpInst = ((m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::CopyShader) && - (nextStage == ShaderStage::Invalid || nextStage == ShaderStage::Fragment)); + (!nextStage || nextStage == ShaderStage::Fragment)); assert(useExpInst); (void(useExpInst)); // unused diff --git a/lgc/patch/PatchReadFirstLane.cpp b/lgc/patch/PatchReadFirstLane.cpp index ee9c388585..301e0c46a0 100644 --- a/lgc/patch/PatchReadFirstLane.cpp +++ b/lgc/patch/PatchReadFirstLane.cpp @@ -361,7 +361,12 @@ void ReadFirstLaneOptimizer::collectAssumeUniforms(BasicBlock *block, while (!candidates.empty()) { Instruction *candidate = candidates.pop_back_val(); - + if (auto intrinsic = dyn_cast(candidate)) { + // Don't lift readfirstlane that is manually added after permlane64 or permlanex16 in subgroupClusteredReduction + if (intrinsic->getIntrinsicID() == Intrinsic::amdgcn_permlane64 || + intrinsic->getIntrinsicID() == Intrinsic::amdgcn_permlanex16) + continue; + } if (isAllUsersAssumedUniform(candidate)) 
tryPropagate(candidate, false); } diff --git a/lgc/patch/PatchResourceCollect.cpp b/lgc/patch/PatchResourceCollect.cpp index d9a091bda7..4d8f42f2b6 100644 --- a/lgc/patch/PatchResourceCollect.cpp +++ b/lgc/patch/PatchResourceCollect.cpp @@ -123,7 +123,7 @@ PreservedAnalyses PatchResourceCollect::run(Module &module, ModuleAnalysisManage if (func.isDeclaration()) continue; auto stage = getShaderStage(&func); - if (!m_shaderStage || &func == pipelineShaders.getEntryPoint(m_shaderStage)) + if (!stage || &func == pipelineShaders.getEntryPoint(stage.value())) continue; m_shaderStage = stage.value(); m_entryPoint = &func; @@ -462,6 +462,9 @@ bool PatchResourceCollect::checkGsOnChipValidity() { useAdjacency = true; inVertsPerPrim = 6; break; + case InputPrimitives::Patch: + inVertsPerPrim = geometryMode.controlPoints; + break; default: llvm_unreachable("Unexpected input primitive type!"); break; @@ -1640,7 +1643,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { auto &inOutUsage = resUsage->inOutUsage; const auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); - auto nextResUsage = nextStage != ShaderStage::Invalid ? m_pipelineState->getShaderResourceUsage(nextStage) : nullptr; + auto nextResUsage = nextStage ? 
m_pipelineState->getShaderResourceUsage(nextStage.value()) : nullptr; assert(inOutUsage.builtInInputLocMap.empty()); // Should be empty assert(inOutUsage.builtInOutputLocMap.empty()); @@ -1812,7 +1815,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { } builtInUsage.vs.primitiveShadingRate = false; - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { // VS only if (builtInUsage.vs.clipDistance > 0 || builtInUsage.vs.cullDistance > 0) { unsigned mapLoc = availOutMapLoc++; @@ -1975,7 +1978,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { if (inOutUsage.builtInOutputLocMap.find(BuiltInCullDistance) != inOutUsage.builtInOutputLocMap.end() && inOutUsage.builtInOutputLocMap[BuiltInCullDistance] == InvalidValue) inOutUsage.builtInOutputLocMap[BuiltInCullDistance] = availOutMapLoc++; - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { // TCS only if (builtInUsage.tcs.position) inOutUsage.builtInOutputLocMap[BuiltInPosition] = availOutMapLoc++; @@ -2033,7 +2036,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { // incorrectness of location assignment during builtin-to-generic mapping. 
const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage); if (prevStage == ShaderStage::TessControl) { - const auto &prevBuiltInUsage = m_pipelineState->getShaderResourceUsage(prevStage)->builtInUsage.tcs; + const auto &prevBuiltInUsage = m_pipelineState->getShaderResourceUsage(prevStage.value())->builtInUsage.tcs; clipDistanceCount = std::max(clipDistanceCount, prevBuiltInUsage.clipDistance); } @@ -2047,7 +2050,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { const auto prevStage = m_pipelineState->getPrevShaderStage(m_shaderStage); if (prevStage == ShaderStage::TessControl) { - const auto &prevBuiltInUsage = m_pipelineState->getShaderResourceUsage(prevStage)->builtInUsage.tcs; + const auto &prevBuiltInUsage = m_pipelineState->getShaderResourceUsage(prevStage.value())->builtInUsage.tcs; cullDistanceCount = std::max(cullDistanceCount, prevBuiltInUsage.clipDistance); } @@ -2158,7 +2161,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { } else { builtInUsage.tes.viewportIndex = 0; } - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { // TES only if (builtInUsage.tes.clipDistance > 0 || builtInUsage.tes.cullDistance > 0) { unsigned mapLoc = availOutMapLoc++; @@ -2282,7 +2285,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { const unsigned mapLoc = nextInOutUsage.builtInInputLocMap[BuiltInViewportIndex]; builtInOutLocs[BuiltInViewportIndex] = mapLoc; } - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { // GS only unsigned availOutMapLoc = inOutUsage.outputLocInfoMap.size(); // Reset available location @@ -2368,7 +2371,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { const unsigned mapLoc = nextInOutUsage.builtInInputLocMap[BuiltInCullDistance]; inOutUsage.mesh.vertexBuiltInExportSlots[BuiltInCullDistance] = mapLoc; } - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { // Mesh shader only unsigned availExportLoc = 
inOutUsage.outputMapLocCount; @@ -2416,7 +2419,7 @@ void PatchResourceCollect::mapBuiltInToGenericInOut() { const unsigned mapLoc = nextInOutUsage.perPrimitiveBuiltInInputLocMap[BuiltInViewportIndex]; inOutUsage.mesh.primitiveBuiltInExportSlots[BuiltInViewportIndex] = mapLoc; } - } else if (nextStage == ShaderStage::Invalid) { + } else if (!nextStage) { // Mesh shader only unsigned availPerPrimitiveExportLoc = inOutUsage.perPrimitiveOutputMapLocCount; @@ -2682,7 +2685,7 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { auto preStage = m_pipelineState->getPrevShaderStage(m_shaderStage); if (preStage == ShaderStage::TessControl || preStage == ShaderStage::Mesh) { if (!inputLocInfoMap.empty()) { - auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(preStage)->inOutUsage.outputLocInfoMap; + auto &outputLocInfoMap = m_pipelineState->getShaderResourceUsage(preStage.value())->inOutUsage.outputLocInfoMap; for (auto &infoPair : outputLocInfoMap) { if (infoPair.second != InvalidValue) { inputLocInfoMap[infoPair.first] = InvalidValue; @@ -2692,7 +2695,8 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { } auto &perPatchInLocMap = inOutUsage.perPatchInputLocMap; if (!perPatchInLocMap.empty()) { - auto &perPatchOutLocMap = m_pipelineState->getShaderResourceUsage(preStage)->inOutUsage.perPatchOutputLocMap; + auto &perPatchOutLocMap = + m_pipelineState->getShaderResourceUsage(preStage.value())->inOutUsage.perPatchOutputLocMap; for (auto &locPair : perPatchOutLocMap) { if (locPair.second != InvalidValue) { perPatchInLocMap[locPair.first] = InvalidValue; @@ -2756,10 +2760,10 @@ void PatchResourceCollect::updateInputLocInfoMapWithUnpack() { // ===================================================================================================================== // Clear unused output from outputLocInfoMap, perPatchOutputLocMap, and perPrimitiveOutputLocMap void PatchResourceCollect::clearUnusedOutput() { - ShaderStageEnum nextStage = 
m_pipelineState->getNextShaderStage(m_shaderStage); + auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); auto &inOutUsage = m_pipelineState->getShaderResourceUsage(m_shaderStage)->inOutUsage; auto &outputLocInfoMap = inOutUsage.outputLocInfoMap; - if (nextStage != ShaderStage::Invalid) { + if (nextStage) { // Collect the locations of TCS's imported outputs DenseSet importOutputLocs; if (m_shaderStage == ShaderStage::TessControl) { @@ -2784,7 +2788,7 @@ void PatchResourceCollect::clearUnusedOutput() { // Do normal input/output matching SmallVector unusedLocInfos; - auto nextResUsage = m_pipelineState->getShaderResourceUsage(nextStage); + auto nextResUsage = m_pipelineState->getShaderResourceUsage(nextStage.value()); const auto &nextInLocInfoMap = nextResUsage->inOutUsage.inputLocInfoMap; for (auto &locInfoPair : outputLocInfoMap) { @@ -2899,8 +2903,9 @@ void PatchResourceCollect::updateOutputLocInfoMapWithUnpack() { // If we don't have to keep the locations and the next stage is valid, try to get location map of the outputs from // corresponding inputs of next stage. const bool keepLocation = m_shaderStage == ShaderStage::Geometry && !canChangeOutputLocationsForGs(); - if (!keepLocation && nextStage != ShaderStage::Invalid) { - auto &nextStageInputLocInfoMap = m_pipelineState->getShaderResourceUsage(nextStage)->inOutUsage.inputLocInfoMap; + if (!keepLocation && nextStage) { + auto &nextStageInputLocInfoMap = + m_pipelineState->getShaderResourceUsage(nextStage.value())->inOutUsage.inputLocInfoMap; for (auto &locInfoPair : outputLocInfoMap) { const auto &locationInfo = locInfoPair.first; auto &newLocationInfo = locInfoPair.second; @@ -3000,9 +3005,9 @@ void PatchResourceCollect::updateOutputLocInfoMapWithUnpack() { assert(m_shaderStage == ShaderStage::TessControl); // If the next stage is valid, try to get location map of the outputs from corresponding inputs of next stage. 
- if (nextStage != ShaderStage::Invalid) { + if (nextStage) { auto &nextStagePerPatchInputLocInfoMap = - m_pipelineState->getShaderResourceUsage(nextStage)->inOutUsage.perPatchInputLocMap; + m_pipelineState->getShaderResourceUsage(nextStage.value())->inOutUsage.perPatchInputLocMap; for (auto &locPair : perPatchOutputLocMap) { if (locPair.second != InvalidValue) continue; // Skip mapped locations @@ -3046,9 +3051,9 @@ void PatchResourceCollect::updateOutputLocInfoMapWithUnpack() { assert(m_shaderStage == ShaderStage::Mesh); // If the next stage is valid, try to get location map of the outputs from corresponding inputs of next stage. - if (nextStage != ShaderStage::Invalid) { + if (nextStage) { auto &nextStagePerPrimitiveInputLocMap = - m_pipelineState->getShaderResourceUsage(nextStage)->inOutUsage.perPrimitiveInputLocMap; + m_pipelineState->getShaderResourceUsage(nextStage.value())->inOutUsage.perPrimitiveInputLocMap; for (auto &locPair : perPrimitiveOutputLocMap) { if (locPair.second != InvalidValue) continue; // Skip mapped locations @@ -3096,7 +3101,7 @@ bool PatchResourceCollect::canChangeOutputLocationsForGs() { return true; if (m_pipelineState->getPalMetadata()->haveFsInputMappings()) return true; - if (m_pipelineState->getNextShaderStage(ShaderStage::Geometry) != ShaderStage::Invalid) + if (m_pipelineState->getNextShaderStage(ShaderStage::Geometry)) return true; return false; } @@ -3157,8 +3162,8 @@ void PatchResourceCollect::updateOutputLocInfoMapWithPack() { assert(m_shaderStage == ShaderStage::Vertex || m_shaderStage == ShaderStage::TessEval || m_shaderStage == ShaderStage::Geometry); auto nextStage = m_pipelineState->getNextShaderStage(m_shaderStage); - assert(nextStage != ShaderStage::Invalid); - auto &nextStageInputLocInfoMap = m_pipelineState->getShaderResourceUsage(nextStage)->inOutUsage.inputLocInfoMap; + auto &nextStageInputLocInfoMap = + m_pipelineState->getShaderResourceUsage(nextStage.value())->inOutUsage.inputLocInfoMap; // Remove unused outputs 
and update the output map if (m_shaderStage != m_pipelineState->getLastVertexProcessingStage()) { @@ -3707,7 +3712,7 @@ void PatchResourceCollect::clearUndefinedOutput() { for (auto call : candidateCalls) { // For unlinked case, we should keep the location info map unchanged. - if (m_pipelineState->getNextShaderStage(m_shaderStage) != ShaderStage::Invalid) { + if (m_pipelineState->getNextShaderStage(m_shaderStage)) { // Remove the output location info if it exists unsigned index = m_shaderStage == ShaderStage::Mesh ? 2 : 1; unsigned component = cast(call->getArgOperand(index))->getZExtValue(); diff --git a/lgc/patch/RegisterMetadataBuilder.cpp b/lgc/patch/RegisterMetadataBuilder.cpp index 6042c4313d..8e452560e8 100644 --- a/lgc/patch/RegisterMetadataBuilder.cpp +++ b/lgc/patch/RegisterMetadataBuilder.cpp @@ -129,9 +129,9 @@ void RegisterMetadataBuilder::buildPalMetadata() { if (hwStageMask & (Util::Abi::HwShaderGs | Util::Abi::HwShaderVs)) buildPaSpecificRegisters(); - if (lastVertexProcessingStage != ShaderStage::Invalid && m_pipelineState->isUnlinked()) { + if (lastVertexProcessingStage && m_pipelineState->isUnlinked()) { // Fill ".preraster_output_semantic" - auto resUsage = m_pipelineState->getShaderResourceUsage(lastVertexProcessingStage); + auto resUsage = m_pipelineState->getShaderResourceUsage(lastVertexProcessingStage.value()); auto &outputLocInfoMap = resUsage->inOutUsage.outputLocInfoMap; auto &builtInOutputLocMap = resUsage->inOutUsage.builtInOutputLocMap; // Collect semantic info for generic input and builtIns {gl_ClipDistance, gl_CulDistance, gl_Layer, @@ -200,6 +200,8 @@ void RegisterMetadataBuilder::buildLsHsRegisters() { lsVgprCompCnt = 3; // Enable all LS VGPRs (LS VGPR2 - VGPR5) else lsVgprCompCnt = 1; // Must enable relative vertex ID (LS VGPR2 and VGPR3) + } else { + llvm_unreachable("Not implemented!"); } getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::LsVgprCompCnt] = lsVgprCompCnt; @@ -226,6 +228,7 @@ void 
RegisterMetadataBuilder::buildEsGsRegisters() { const auto gsResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::Geometry); const auto &gsBuiltInUsage = gsResUsage->builtInUsage.gs; const auto &gsInOutUsage = gsResUsage->inOutUsage; + const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); const auto &calcFactor = gsInOutUsage.gs.calcFactor; const auto tesResUsage = m_pipelineState->getShaderResourceUsage(ShaderStage::TessEval); const auto &tesBuiltInUsage = tesResUsage->builtInUsage.tes; @@ -233,11 +236,12 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { // ES_VGPR_COMP_CNT in SPI_SHADER_PGM_RSRC2_GS unsigned gsVgprCompCnt = 0; - if (calcFactor.inputVertices > 4 || gsBuiltInUsage.invocationId) + if ((calcFactor.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) || + gsBuiltInUsage.invocationId) gsVgprCompCnt = 3; // Enable vtx4/vtx5 offset (GS VGPR3) or GS instance ID (GS VGPR4) else if (gsBuiltInUsage.primitiveIdIn) gsVgprCompCnt = 2; // Enable primitive ID (GS VGPR2) - else if (calcFactor.inputVertices > 2) + else if (calcFactor.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) gsVgprCompCnt = 1; // Enable vtx2/vtx3 offset (GS VGPR1) getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::GsVgprCompCnt] = gsVgprCompCnt; @@ -257,7 +261,6 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { getHwShaderNode(Util::Abi::HardwareStage::Gs)[Util::Abi::HardwareStageMetadataKey::OffchipLdsEn] = hasTs; // VGT_GS_MAX_VERT_OUT - const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); unsigned maxVertOut = std::max(1u, static_cast(geometryMode.outputVertices)); getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtGsMaxVertOut] = maxVertOut; @@ -345,6 +348,13 @@ void RegisterMetadataBuilder::buildEsGsRegisters() { // VGT_ESGS_RING_ITEMSIZE getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtEsgsRingItemsize] = 
calcFactor.esGsRingItemSize; + // VGT_LS_HS_CONFIG + if (geometryMode.inputPrimitive == InputPrimitives::Patch) { + assert(geometryMode.controlPoints > 0); + auto vgtLsHsConfig = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtLsHsConfig].getMap(true); + vgtLsHsConfig[Util::Abi::VgtLsHsConfigMetadataKey::HsNumInputCp] = geometryMode.controlPoints; + } + // GE_MAX_OUTPUT_PER_SUBGROUP and VGT_GS_MAX_PRIMS_PER_SUBGROUP const unsigned maxPrimsPerSubgroup = std::min(gsInstPrimsInSubgrp * maxVertOut, MaxGsThreadsPerSubgroup); getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::MaxVertsPerSubgroup] = maxPrimsPerSubgroup; @@ -380,11 +390,12 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { unsigned gsVgprCompCnt = 0; if (m_gfxIp.major <= 11) { if (m_hasGs) { - if (calcFactor.inputVertices > 4 || gsBuiltInUsage.invocationId) + if ((calcFactor.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) || + gsBuiltInUsage.invocationId) gsVgprCompCnt = 3; // Enable vtx4/vtx5 offset (GS VGPR3) or GS instance ID (GS VGPR4) else if (gsBuiltInUsage.primitiveIdIn) gsVgprCompCnt = 2; // Enable primitive ID (GS VGPR2) - else if (calcFactor.inputVertices > 2) + else if (calcFactor.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) gsVgprCompCnt = 1; // Enable vtx2/vtx3 offset (GS VGPR1) } else if (m_hasVs) { // NOTE: When GS is absent, only those VGPRs are required: vtx0/vtx1 offset, vtx2/vtx3 offset, @@ -584,6 +595,13 @@ void RegisterMetadataBuilder::buildPrimShaderRegisters() { (m_hasGs ? 
calcFactor.esGsRingItemSize : 1); } + // VGT_LS_HS_CONFIG + if (geometryMode.inputPrimitive == InputPrimitives::Patch) { + assert(geometryMode.controlPoints > 0); + auto vgtLsHsConfig = getGraphicsRegNode()[Util::Abi::GraphicsRegisterMetadataKey::VgtLsHsConfig].getMap(true); + vgtLsHsConfig[Util::Abi::VgtLsHsConfigMetadataKey::HsNumInputCp] = geometryMode.controlPoints; + } + const auto nggControl = m_pipelineState->getNggControl(); assert(nggControl->enableNgg); if (!nggControl->passthroughMode) { @@ -1285,7 +1303,7 @@ void RegisterMetadataBuilder::buildPaSpecificRegisters() { // On 10.3+ all auxiliary position exports are optimized, not just the misc exports. if (m_gfxIp >= GfxIpVersion{10, 3}) - paClClipCntl[Util::Abi::PaClVsOutCntlMetadataKey::VsOutMiscSideBusEna] = true; + paClVsOutCntl[Util::Abi::PaClVsOutCntlMetadataKey::VsOutMiscSideBusEna] = true; } // PA_CL_VTE_CNTL @@ -1385,7 +1403,7 @@ void RegisterMetadataBuilder::setVgtShaderStagesEn(unsigned hwStageMask) { ShaderStageEnum apiStage = ShaderStage::Vertex; if (m_hasGs || m_hasMesh) { apiStage = m_hasGs ? 
ShaderStage::Geometry : ShaderStage::Mesh; - vgtShaderStagesEn[Util::Abi::VgtShaderStagesEnMetadataKey::GsStageEn] = GS_STAGE_ON; + vgtShaderStagesEn[Util::Abi::VgtShaderStagesEnMetadataKey::GsStageEn] = true; } else if (m_hasTes) { apiStage = ShaderStage::TessEval; } diff --git a/lgc/patch/ShaderMerger.cpp b/lgc/patch/ShaderMerger.cpp index 4b6a4b8121..14b49e5dc1 100644 --- a/lgc/patch/ShaderMerger.cpp +++ b/lgc/patch/ShaderMerger.cpp @@ -728,10 +728,12 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function ArrayRef vgprArgs(args.begin() + NumSpecialSgprInputs + 1, args.end()); // GS VGPRs + const auto &geometryMode = m_pipelineState->getShaderModes()->getGeometryShaderMode(); + Value *esGsOffsets01 = vgprArgs[0]; Value *esGsOffsets23 = PoisonValue::get(builder.getInt32Ty()); - if (calcFactor.inputVertices > 2) { + if (calcFactor.inputVertices > 2 && geometryMode.inputPrimitive != InputPrimitives::Patch) { // NOTE: ES to GS offset (vertex 2 and 3) is valid once the primitive type has more than 2 vertices. esGsOffsets23 = vgprArgs[1]; } @@ -740,7 +742,7 @@ Function *ShaderMerger::generateEsGsEntryPoint(Function *esEntryPoint, Function Value *invocationId = vgprArgs[3]; Value *esGsOffsets45 = PoisonValue::get(builder.getInt32Ty()); - if (calcFactor.inputVertices > 4) { + if (calcFactor.inputVertices > 4 && geometryMode.inputPrimitive != InputPrimitives::Patch) { // NOTE: ES to GS offset (vertex 4 and 5) is valid once the primitive type has more than 4 vertices. 
esGsOffsets45 = vgprArgs[4]; } diff --git a/lgc/patch/SystemValues.cpp b/lgc/patch/SystemValues.cpp index 0deecd5e06..1f8396c9f3 100644 --- a/lgc/patch/SystemValues.cpp +++ b/lgc/patch/SystemValues.cpp @@ -240,6 +240,7 @@ Value *ShaderSystemValues::getEsGsOffsets() { auto insertPos = &*m_entryPoint->front().getFirstNonPHIOrDbgOrAlloca(); auto intfData = m_pipelineState->getShaderInterfaceData(m_shaderStage); + // TODO: We should only insert those offsets required by the specified input primitive. m_esGsOffsets = PoisonValue::get(FixedVectorType::get(Type::getInt32Ty(*m_context), 6)); for (unsigned i = 0; i < InterfaceData::MaxEsGsOffsetCount; ++i) { auto esGsOffset = @@ -320,8 +321,6 @@ std::pair> ShaderSystemValues::getEmitCounterPtr() { assert(m_shaderStage == ShaderStage::Geometry); auto *emitCounterTy = Type::getInt32Ty(*m_context); if (m_emitCounterPtrs.empty()) { - // TODO: We should only insert those offsets required by the specified input primitive. - // Setup GS emit vertex counter auto &dataLayout = m_entryPoint->getParent()->getDataLayout(); auto insertPos = &*m_entryPoint->front().getFirstNonPHIOrDbgOrAlloca(); @@ -334,6 +333,21 @@ std::pair> ShaderSystemValues::getEmitCounterPtr() { return std::make_pair(emitCounterTy, ArrayRef(m_emitCounterPtrs)); } +// ===================================================================================================================== +// Get pointer to total emit counter (GS) +Value *ShaderSystemValues::getTotalEmitCounterPtr() { + assert(m_shaderStage == ShaderStage::Geometry); + assert(m_pipelineState->getShaderModes()->getGeometryShaderMode().robustGsEmits); // Must enable robust GS emits + if (!m_totalEmitCounterPtr) { + // Setup GS total emit vertex counter + BuilderBase builder(&*m_entryPoint->front().getFirstNonPHIOrDbgOrAlloca()); + + m_totalEmitCounterPtr = builder.CreateAlloca(builder.getInt32Ty()); + builder.CreateStore(builder.getInt32(0), m_totalEmitCounterPtr); + } + return m_totalEmitCounterPtr; +} + 
// ===================================================================================================================== // Get internal global table pointer as pointer to i8. Instruction *ShaderSystemValues::getInternalGlobalTablePtr() { diff --git a/lgc/patch/VertexFetch.cpp b/lgc/patch/VertexFetch.cpp index c8cf1f7e44..70c618aeff 100644 --- a/lgc/patch/VertexFetch.cpp +++ b/lgc/patch/VertexFetch.cpp @@ -118,9 +118,10 @@ class VertexFetchImpl : public VertexFetch { Value *loadVertexBufferDescriptor(unsigned binding, BuilderImpl &builderImpl); - void addVertexFetchInst(Value *vbDesc, Value *vbIndex, Value *srdStride, unsigned numChannels, unsigned offset, - unsigned dfmt, unsigned nfmt, unsigned inputCompBytes, unsigned fetchCompBytes, bool isSigned, - bool isPacked, bool fetchInByte, BuilderImpl &builderImpl, Value **ppFetch) const; + void addVertexFetchInst(Value *vbDesc, Value *vbIndex, Value *srdStride, Type *inputTy, unsigned numChannels, + unsigned offset, unsigned dfmt, unsigned nfmt, unsigned inputCompBytes, + unsigned fetchCompBytes, bool isSigned, bool isPacked, bool fetchInByte, + BuilderImpl &builderImpl, Value **ppFetch) const; bool needPostShuffle(const VertexInputDescription *inputDesc, std::vector &shuffleMask) const; @@ -715,8 +716,8 @@ Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, Value *descPtr, m_instanceIndex = ShaderInputs::getInstanceIndex(builder, *m_lgcContext); } - // Get the vertex buffer table pointer as pointer to v4i32 descriptor. 
- Type *vbDescTy = FixedVectorType::get(Type::getInt32Ty(*m_context), 4); + Type *vbDescTy = nullptr; + { vbDescTy = FixedVectorType::get(Type::getInt32Ty(*m_context), 4); } if (!m_vertexBufTablePtr) { IRBuilderBase::InsertPointGuard ipg(builder); builder.SetInsertPointPastAllocas(inst->getFunction()); @@ -835,13 +836,16 @@ Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, Value *descPtr, assert(bitWidth == 8 || bitWidth == 16 || bitWidth == 32 || bitWidth == 64); Intrinsic::ID instId = Intrinsic::amdgcn_struct_buffer_load_format; - if (m_useSoftwareVertexBufferDescriptors) { - instId = Intrinsic::amdgcn_raw_buffer_load_format; - auto srdStride = builder.CreateExtractElement(vbDesc, 3); - byteOffset = builder.CreateAdd(builder.CreateMul(vbIndex, srdStride), byteOffset); + + { + if (m_useSoftwareVertexBufferDescriptors) { + instId = Intrinsic::amdgcn_raw_buffer_load_format; + auto srdStride = builder.CreateExtractElement(vbDesc, 3); + byteOffset = builder.CreateAdd(builder.CreateMul(vbIndex, srdStride), byteOffset); + } + // Replace buffer format + vbDesc = builder.CreateInsertElement(vbDesc, bufferFormat, 3); } - // Replace buffer format - vbDesc = builder.CreateInsertElement(vbDesc, bufferFormat, 3); SmallVector args; args.push_back(vbDesc); @@ -849,7 +853,7 @@ Value *VertexFetchImpl::fetchVertex(InputImportGenericOp *inst, Value *descPtr, args.push_back(vbIndex); unsigned offsetIdx = args.size(); args.push_back(byteOffset); - args.push_back(builder.getInt32(0)); + { args.push_back(builder.getInt32(0)); } args.push_back(builder.getInt32(0)); if (disablePerCompFetch) { @@ -1251,9 +1255,9 @@ Value *VertexFetchImpl::fetchVertex(Type *inputTy, const VertexInputDescription // After back-end optimization, intrinsics may be combined to fetch the whole vertex in generated ISA codes. // To make sure combination works, we need to keep tbuffer_load formats as same as possible when visit this function. 
// To avoid redundant extract and insert operation, we need to keep component bit width as same as input component. - addVertexFetchInst(vbDesc, vbIndex, srdStride, numChannels, description->offset, compFormatInfo->fetchDfmt, - description->nfmt, inputCompBytes, fetchCompBytes, numFormatInfo->isSigned, isPacked, fetchInByte, - builderImpl, &vertexFetch); + addVertexFetchInst(vbDesc, vbIndex, srdStride, inputCompTy, numChannels, description->offset, + compFormatInfo->fetchDfmt, description->nfmt, inputCompBytes, fetchCompBytes, + numFormatInfo->isSigned, isPacked, fetchInByte, builderImpl, &vertexFetch); // When do fetch in Byte, we need to emulate final results manually. postFetchEmulation(description, fetchInByte, inputCompBytes, numChannels, numFormatInfo, compFormatInfo, builderImpl, @@ -1611,10 +1615,10 @@ void VertexFetchImpl::postFetchEmulation(const VertexInputDescription *descripti // @param fetchInByte: Do fetch in Byte if the vertex attribute offset and stride are not aligned. 
// @param builderImpl : BuilderImpl to use to insert vertex fetch instructions // @param [out] ppFetch : Destination of vertex fetch -void VertexFetchImpl::addVertexFetchInst(Value *vbDesc, Value *vbIndex, Value *srdStride, unsigned numChannels, - unsigned offset, unsigned dfmt, unsigned nfmt, unsigned inputCompBytes, - unsigned fetchCompBytes, bool isSigned, bool isPacked, bool fetchInByte, - BuilderImpl &builderImpl, Value **ppFetch) const { +void VertexFetchImpl::addVertexFetchInst(Value *vbDesc, Value *vbIndex, Value *srdStride, Type *inputTy, + unsigned numChannels, unsigned offset, unsigned dfmt, unsigned nfmt, + unsigned inputCompBytes, unsigned fetchCompBytes, bool isSigned, bool isPacked, + bool fetchInByte, BuilderImpl &builderImpl, Value **ppFetch) const { Intrinsic::ID instId = Intrinsic::amdgcn_struct_tbuffer_load; Value *instOffset = builderImpl.getInt32(0); if (m_useSoftwareVertexBufferDescriptors) { @@ -1703,7 +1707,11 @@ void VertexFetchImpl::addVertexFetchInst(Value *vbDesc, Value *vbIndex, Value *s if (inputCompBytes < compBytes) compVal = builderImpl.CreateTrunc(compVal, inputCompTy); else if (inputCompBytes > compBytes) { - if (isSigned) + if (inputTy->isFloatTy() && nfmt == BufNumFormatFloat) { + compVal = builderImpl.CreateBitCast(compVal, builderImpl.getHalfTy()); + compVal = builderImpl.CreateFPExt(compVal, builderImpl.getFloatTy()); + compVal = builderImpl.CreateBitCast(compVal, inputCompTy); + } else if (isSigned) compVal = builderImpl.CreateSExt(compVal, inputCompTy); else compVal = builderImpl.CreateZExt(compVal, inputCompTy); @@ -1783,15 +1791,14 @@ std::pair VertexFetchImpl::convertSrdToOffsetMode(Value *vbDes // uint32 strideInBytes; // }; + GfxIpVersion gfxIp = m_lgcContext->getTargetInfo().getGfxIpVersion(); // Stride is from the third DWORD. 
auto srdStride = builder.CreateExtractElement(vbDesc, 3); - SqBufRsrcWord3 sqBufRsrcWord3 = {}; sqBufRsrcWord3.bits.dstSelX = BUF_DST_SEL_X; sqBufRsrcWord3.bits.dstSelY = BUF_DST_SEL_Y; sqBufRsrcWord3.bits.dstSelZ = BUF_DST_SEL_Z; sqBufRsrcWord3.bits.dstSelW = BUF_DST_SEL_W; - GfxIpVersion gfxIp = m_lgcContext->getTargetInfo().getGfxIpVersion(); if (gfxIp.major == 10) { sqBufRsrcWord3.gfx10.format = BUF_FORMAT_32_UINT; sqBufRsrcWord3.gfx10.resourceLevel = 1; diff --git a/lgc/state/PipelineState.cpp b/lgc/state/PipelineState.cpp index 5734ce2b56..a4379a9c8d 100644 --- a/lgc/state/PipelineState.cpp +++ b/lgc/state/PipelineState.cpp @@ -497,41 +497,43 @@ void PipelineState::readShaderStageMask(Module *module) { // ===================================================================================================================== // Get the last vertex processing shader stage in this pipeline, or ShaderStage::Invalid if none. -ShaderStageEnum PipelineState::getLastVertexProcessingStage() const { - if (m_stageMask.contains(ShaderStage::CopyShader)) - return ShaderStage::CopyShader; - if (m_stageMask.contains(ShaderStage::Geometry)) - return ShaderStage::Geometry; - if (m_stageMask.contains(ShaderStage::TessEval)) - return ShaderStage::TessEval; - if (m_stageMask.contains(ShaderStage::Vertex)) - return ShaderStage::Vertex; - return ShaderStage::Invalid; +std::optional PipelineState::getLastVertexProcessingStage() const { + for (auto stage : {ShaderStage::CopyShader, ShaderStage::Geometry, ShaderStage::TessEval, ShaderStage::Vertex}) { + if (m_stageMask.contains(stage)) + return stage; + } + return std::nullopt; } // ===================================================================================================================== // Gets the previous active shader stage in this pipeline // // @param shaderStage : Current shader stage -ShaderStageEnum PipelineState::getPrevShaderStage(ShaderStageEnum shaderStage) const { +std::optional 
PipelineState::getPrevShaderStage(ShaderStageEnum shaderStage) const { if (shaderStage == ShaderStage::Compute) - return ShaderStage::Invalid; + return std::nullopt; if (shaderStage == ShaderStage::CopyShader) { // Treat copy shader as part of geometry shader shaderStage = ShaderStage::Geometry; } - assert(shaderStage < ShaderStage::GfxCount); + std::optional prevStage; - ShaderStageEnum prevStage = ShaderStage::Invalid; + bool foundCurrent = false; + for (auto stage : llvm::reverse(ShaderStagesGraphics)) { + if (!foundCurrent) { + if (stage == shaderStage) + foundCurrent = true; + continue; + } - for (int stage = shaderStage - 1; stage >= 0; --stage) { - if (m_stageMask.contains(static_cast(stage))) { - prevStage = static_cast(stage); + if (m_stageMask.contains(stage)) { + prevStage = stage; break; } } + assert(foundCurrent); return prevStage; } @@ -540,28 +542,34 @@ ShaderStageEnum PipelineState::getPrevShaderStage(ShaderStageEnum shaderStage) c // Gets the next active shader stage in this pipeline // // @param shaderStage : Current shader stage -ShaderStageEnum PipelineState::getNextShaderStage(ShaderStageEnum shaderStage) const { +std::optional PipelineState::getNextShaderStage(ShaderStageEnum shaderStage) const { if (shaderStage == ShaderStage::Compute) - return ShaderStage::Invalid; + return std::nullopt; if (shaderStage == ShaderStage::CopyShader) { // Treat copy shader as part of geometry shader shaderStage = ShaderStage::Geometry; } - assert(shaderStage < ShaderStage::GfxCount); - - ShaderStageEnum nextStage = ShaderStage::Invalid; + std::optional nextStage; auto stageMask = m_stageMask; if (isPartPipeline()) stageMask |= ShaderStageMask(ShaderStage::Fragment); - for (unsigned stage = shaderStage + 1; stage < ShaderStage::GfxCount; ++stage) { - if (stageMask.contains(static_cast(stage))) { - nextStage = static_cast(stage); + bool foundCurrent = false; + for (auto stage : ShaderStagesGraphics) { + if (!foundCurrent) { + if (stage == shaderStage) + 
foundCurrent = true; + continue; + } + + if (stageMask.contains(stage)) { + nextStage = stage; break; } } + assert(foundCurrent); return nextStage; } @@ -1434,8 +1442,8 @@ void PipelineState::buildAbiHwShaderMap() { } else { if (hasGs) { auto preGsStage = getPrevShaderStage(ShaderStage::Geometry); - if (preGsStage != ShaderStage::Invalid) - m_abiHwShaderMap[preGsStage] = Util::Abi::HwShaderGs; + if (preGsStage.has_value()) + m_abiHwShaderMap[preGsStage.value()] = Util::Abi::HwShaderGs; } if (hasTcs) { m_abiHwShaderMap[ShaderStage::TessControl] = Util::Abi::HwShaderHs; @@ -1444,16 +1452,16 @@ void PipelineState::buildAbiHwShaderMap() { } auto lastVertexProcessingStage = getLastVertexProcessingStage(); - if (lastVertexProcessingStage != ShaderStage::Invalid) { + if (lastVertexProcessingStage.has_value()) { if (lastVertexProcessingStage == ShaderStage::CopyShader) lastVertexProcessingStage = ShaderStage::Geometry; if (isNggEnabled()) { - m_abiHwShaderMap[lastVertexProcessingStage] = Util::Abi::HwShaderGs; + m_abiHwShaderMap[lastVertexProcessingStage.value()] = Util::Abi::HwShaderGs; m_abiPipelineType = hasTs ? Util::Abi::PipelineType::NggTess : Util::Abi::PipelineType::Ngg; } else { - m_abiHwShaderMap[lastVertexProcessingStage] = Util::Abi::HwShaderVs; + m_abiHwShaderMap[lastVertexProcessingStage.value()] = Util::Abi::HwShaderVs; if (hasGs) - m_abiHwShaderMap[lastVertexProcessingStage] |= Util::Abi::HwShaderGs; + m_abiHwShaderMap[lastVertexProcessingStage.value()] |= Util::Abi::HwShaderGs; if (hasTs && hasGs) m_abiPipelineType = Util::Abi::PipelineType::GsTess; @@ -1688,7 +1696,7 @@ bool PipelineState::enableSwXfb() { auto lastVertexStage = getLastVertexProcessingStage(); lastVertexStage = lastVertexStage == ShaderStage::CopyShader ? 
ShaderStage::Geometry : lastVertexStage; - if (lastVertexStage == ShaderStage::Invalid) { + if (!lastVertexStage) { assert(isUnlinked()); // Unlinked fragment shader or part-pipeline return false; } @@ -1933,16 +1941,16 @@ void PipelineState::initializeInOutPackState() { // We are assuming that if any of the vertex processing, then the vertex processing stages are complete. For // example, if we see a vertex shader and geometry shader with no tessellation shaders, then we will assume we can // pack the vertex outputs and geometry inputs because no tessellation shader will be added later. - for (ShaderStageEnum stage : lgc::enumRange(ShaderStage::GfxCount)) { + for (auto stage : ShaderStagesGraphics) { if (!m_stageMask.contains(stage)) continue; if (stage == ShaderStage::TessEval) continue; - ShaderStageEnum preStage = getPrevShaderStage(stage); - if (preStage == ShaderStage::Invalid) + auto preStage = getPrevShaderStage(stage); + if (!preStage) continue; m_inputPackState[stage] = true; - m_outputPackState[preStage] = true; + m_outputPackState[*preStage] = true; } } } @@ -1952,12 +1960,12 @@ void PipelineState::initializeInOutPackState() { // // @param shaderStage : The given shader stage bool PipelineState::canPackInput(ShaderStageEnum shaderStage) { - ShaderStageEnum preStage = getPrevShaderStage(shaderStage); + auto preStage = getPrevShaderStage(shaderStage); // The input packable state of the current stage should match the output packable state of the previous stage, except // that the current stage has no previous and it is a null FS. 
- if (preStage != ShaderStage::Invalid && + if (preStage && !(shaderStage == ShaderStage::Fragment && getShaderResourceUsage(shaderStage)->inOutUsage.fs.isNullFs)) - assert(m_inputPackState[shaderStage] == m_outputPackState[preStage]); + assert(m_inputPackState[shaderStage] == m_outputPackState[preStage.value()]); return m_inputPackState[shaderStage]; } @@ -1966,25 +1974,30 @@ bool PipelineState::canPackInput(ShaderStageEnum shaderStage) { // // @param shaderStage : The given shader stage bool PipelineState::canPackOutput(ShaderStageEnum shaderStage) { - ShaderStageEnum nextStage = getNextShaderStage(shaderStage); + auto nextStage = getNextShaderStage(shaderStage); // The output packable state of the current stage should match the input packable state of the next stage, except that // the current stage has no next stage or a null FS. - if (nextStage != ShaderStage::Invalid && - !(nextStage == ShaderStage::Fragment && getShaderResourceUsage(nextStage)->inOutUsage.fs.isNullFs)) - assert(m_outputPackState[shaderStage] == m_inputPackState[nextStage]); + if (nextStage && !(nextStage == ShaderStage::Fragment && getShaderResourceUsage(*nextStage)->inOutUsage.fs.isNullFs)) + assert(m_outputPackState[shaderStage] == m_inputPackState[*nextStage]); return m_outputPackState[shaderStage]; } // ===================================================================================================================== // Get the count of vertices per primitive. For GS, the count is for output primitive. 
unsigned PipelineState::getVerticesPerPrimitive() { - if (hasShaderStage(ShaderStage::Geometry)) { - const auto &geometryMode = getShaderModes()->getGeometryShaderMode(); - switch (geometryMode.outputPrimitive) { + if (hasShaderStage(ShaderStage::Geometry) || hasShaderStage(ShaderStage::Mesh)) { + OutputPrimitives outputPrimitive = OutputPrimitives::Points; + if (hasShaderStage(ShaderStage::Geometry)) + outputPrimitive = getShaderModes()->getGeometryShaderMode().outputPrimitive; + else + outputPrimitive = getShaderModes()->getMeshShaderMode().outputPrimitive; + switch (outputPrimitive) { case OutputPrimitives::Points: return 1; + case OutputPrimitives::Lines: case OutputPrimitives::LineStrip: return 2; + case OutputPrimitives::Triangles: case OutputPrimitives::TriangleStrip: return 3; default: diff --git a/llpc/context/GfxRuntimeContext.cpp b/lgc/state/RuntimeContext.cpp similarity index 90% rename from llpc/context/GfxRuntimeContext.cpp rename to lgc/state/RuntimeContext.cpp index d7932edf6b..644b892748 100644 --- a/llpc/context/GfxRuntimeContext.cpp +++ b/lgc/state/RuntimeContext.cpp @@ -24,17 +24,15 @@ **********************************************************************************************************************/ /** *********************************************************************************************************************** - * @file GfxRuntimeContext.cpp - * @brief LLVMContext extension that stores a GfxRuntime library module + * @file RuntimeContext.cpp + * @brief LLVMContext extension that stores a Runtime library module *********************************************************************************************************************** */ -#include "GfxRuntimeContext.h" +#include "lgc/RuntimeContext.h" #include "llvm/IR/Module.h" using namespace llvm; using namespace lgc; GfxRuntimeContext::Key GfxRuntimeContext::theKey; - -GfxRuntimeContext::~GfxRuntimeContext() = default; diff --git a/lgc/state/ShaderStage.cpp 
b/lgc/state/ShaderStage.cpp index c7eb5d2f75..78b3868fea 100644 --- a/lgc/state/ShaderStage.cpp +++ b/lgc/state/ShaderStage.cpp @@ -82,8 +82,9 @@ void lgc::setShaderStage(GlobalObject *func, std::optional stag MDNode::get(func->getContext(), {ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(func->getContext()), stage.value()))}); func->setMetadata(mdKindId, stageMetaNode); - } else + } else { func->eraseMetadata(mdKindId); + } } // ===================================================================================================================== @@ -137,13 +138,28 @@ bool lgc::isShaderEntryPoint(const Function *func) { // // @param shaderStage : Shader stage const char *lgc::getShaderStageAbbreviation(ShaderStageEnum shaderStage) { - if (shaderStage == ShaderStage::CopyShader) + switch (shaderStage) { + case ShaderStage::Compute: + return "CS"; + case ShaderStage::Fragment: + return "FS"; + case ShaderStage::Vertex: + return "VS"; + case ShaderStage::Geometry: + return "GS"; + case ShaderStage::CopyShader: return "COPY"; - if (shaderStage > ShaderStage::Compute) - return "Bad"; - - static const char *ShaderStageAbbrs[] = {"TASK", "VS", "TCS", "TES", "GS", "MESH", "FS", "CS"}; - return ShaderStageAbbrs[static_cast(shaderStage)]; + case ShaderStage::TessControl: + return "TCS"; + case ShaderStage::TessEval: + return "TES"; + case ShaderStage::Task: + return "TASK"; + case ShaderStage::Mesh: + return "MESH"; + default: + llvm_unreachable("Unhandled ShaderStage"); + } } // ===================================================================================================================== diff --git a/lgc/test/CsLowerDebugPrintf.lgc b/lgc/test/CsLowerDebugPrintf.lgc index f4e21a52b0..d380f37da8 100644 --- a/lgc/test/CsLowerDebugPrintf.lgc +++ b/lgc/test/CsLowerDebugPrintf.lgc @@ -6,27 +6,29 @@ source_filename = "llpc_compute_8" target datalayout = 
"e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7" target triple = "amdgcn--amdpal" -@str.1 = internal addrspace(4) constant [10 x i8] c"Output:%d\0A" +@0 = private unnamed_addr constant [11 x i8] c"Output:%d\0A\00", align 1 +@1 = private unnamed_addr constant [22 x i8] c"workgroup size:%f,%f\0A\00", align 1 ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !spirv.ExecutionModel !7 !lgc.shaderstage !8 { .entry: - %0 = call <3 x i32> @lgc.shader.input.WorkgroupId(i32 0) #1 + %0 = call <3 x i32> @lgc.shader.input.WorkgroupId(i32 0) #2 %1 = mul <3 x i32> %0, - %2 = call <3 x i32> @lgc.shader.input.LocalInvocationId(i32 47) #1 - %3 = insertelement <3 x i32> %2, i32 0, i64 1 - %4 = insertelement <3 x i32> %3, i32 0, i64 2 - %5 = call <3 x i32> @lgc.reconfigure.local.invocation.id(<3 x i32> %4, i32 0) #1 - %6 = add <3 x i32> %1, %5 - %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract = extractelement <3 x i32> %6, i64 0 - %7 = call ptr addrspace(4) @lgc.descriptor.table.addr(i32 6, i32 6, i32 -1, i32 6, i32 -1) #1 - %8 = getelementptr i8, ptr addrspace(4) %7, i32 0 - %9 = load <4 x i32>, ptr addrspace(4) %8, align 16 - %10 = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> %9) - %11 = insertelement <2 x i32> poison, i32 0, i64 0 - %12 = insertelement <2 x i32> %11, i32 0, i64 1 - %13 = bitcast <2 x i32> %12 to i64 - call void (...) 
@lgc.debug.printf(ptr addrspace(7) %10, ptr addrspace(4) @str.1, i32 %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract, i64 %13) + %2 = call i32 @lgc.shader.input.LocalInvocationId(i32 49) #2 + %3 = and i32 %2, 1023 + %4 = insertelement <3 x i32> poison, i32 %3, i64 0 + %5 = lshr i32 %2, 10 + %6 = and i32 %5, 1023 + %7 = insertelement <3 x i32> %4, i32 %6, i64 1 + %8 = lshr i32 %5, 10 + %9 = insertelement <3 x i32> %7, i32 %8, i64 2 + %10 = insertelement <3 x i32> %9, i32 0, i64 1 + %11 = insertelement <3 x i32> %10, i32 0, i64 2 + %12 = call <3 x i32> @lgc.reconfigure.local.invocation.id(<3 x i32> %11, i32 0) #2 + %13 = add <3 x i32> %1, %12 + %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract = extractelement <3 x i32> %13, i64 0 + call void (...) @lgc.debug.printf(ptr nonnull @0, i32 %__llpc_input_proxy_gl_GlobalInvocationID.0.vec.extract) + call void (...) @lgc.debug.printf(ptr nonnull @1, double 1.000000e+00, double 1.000000e+00) ret void } @@ -70,42 +72,64 @@ attributes #2 = { nounwind willreturn memory(none) } !8 = !{i32 7} ; CHECK-LABEL: @lgc.shader.CS.main( ; CHECK-NEXT: .entry: -; CHECK-NEXT: [[TMP0:%.*]] = call <3 x i32> @lgc.shader.input.WorkgroupId(i32 0) #[[ATTR1:[0-9]+]] -; CHECK-NEXT: [[TMP1:%.*]] = mul <3 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = call <3 x i32> @lgc.shader.input.LocalInvocationId(i32 47) #[[ATTR1]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 0, i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 0, i64 2 -; CHECK-NEXT: [[TMP5:%.*]] = call <3 x i32> @lgc.reconfigure.local.invocation.id(<3 x i32> [[TMP4]], i32 0) #[[ATTR1]] -; CHECK-NEXT: [[TMP6:%.*]] = add <3 x i32> [[TMP1]], [[TMP5]] -; CHECK-NEXT: [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT:%.*]] = extractelement <3 x i32> [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = call ptr addrspace(4) @lgc.descriptor.table.addr(i32 6, i32 6, i32 -1, i32 6, i32 -1) #[[ATTR1]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr 
i8, ptr addrspace(4) [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP8]], align 16 -; CHECK-NEXT: [[TMP10:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 0, i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 0, i64 1 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i32> [[TMP12]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[TMP13]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP13]], 32 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; CHECK-NEXT: [[TMP17:%.*]] = atomicrmw add ptr addrspace(7) [[TMP10]], i64 5 monotonic, align 8 -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP17]], i64 536870912) -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 4 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP10]], i32 [[TMP20]] -; CHECK-NEXT: store i32 {{-?[0-9]+}}, ptr addrspace(7) [[TMP21]], align 4 -; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], 1 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP10]], i32 [[TMP22]] -; CHECK-NEXT: store i32 {{-?[0-9]+}}, ptr addrspace(7) [[TMP23]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = add i32 [[TMP22]], 1 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP10]], i32 [[TMP24]] -; CHECK-NEXT: store i32 [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT]], ptr addrspace(7) [[TMP25]], align 4 -; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP24]], 1 -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP10]], i32 [[TMP26]] -; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(7) [[TMP27]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @lgc.load.user.data__i32(i32 0) +; CHECK-NEXT: [[TMP3:%.*]] = 
insertelement <2 x i32> [[TMP1]], i32 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr addrspace(4) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP6]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = call ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <3 x i32> @lgc.shader.input.WorkgroupId(i32 0) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <3 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @lgc.shader.input.LocalInvocationId(i32 49) #[[ATTR2]] +; CHECK-NEXT: [[TMP12:%.*]] = and i32 [[TMP11]], 1023 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x i32> poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = lshr i32 [[TMP11]], 10 +; CHECK-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], 1023 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <3 x i32> [[TMP13]], i32 [[TMP15]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = lshr i32 [[TMP14]], 10 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <3 x i32> [[TMP16]], i32 [[TMP17]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <3 x i32> [[TMP18]], i32 0, i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <3 x i32> [[TMP19]], i32 0, i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = call <3 x i32> @lgc.reconfigure.local.invocation.id(<3 x i32> [[TMP20]], i32 0) #[[ATTR2]] +; CHECK-NEXT: [[TMP22:%.*]] = add <3 x i32> [[TMP10]], [[TMP21]] +; CHECK-NEXT: [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT:%.*]] = extractelement <3 x i32> [[TMP22]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = atomicrmw add ptr addrspace(7) [[TMP8]], i64 3 monotonic, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP23]], i64 536870912) +; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP25]], 4 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr 
addrspace(7) [[TMP8]], i32 [[TMP26]] +; CHECK-NEXT: store i32 {{-?[0-9]+}}, ptr addrspace(7) [[TMP27]], align 4 ; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[TMP26]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP10]], i32 [[TMP28]] -; CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(7) [[TMP29]], align 4 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP8]], i32 [[TMP28]] +; CHECK-NEXT: store i32 {{-?[0-9]+}}, ptr addrspace(7) [[TMP29]], align 4 ; CHECK-NEXT: [[TMP30:%.*]] = add i32 [[TMP28]], 1 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP8]], i32 [[TMP30]] +; CHECK-NEXT: store i32 [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT]], ptr addrspace(7) [[TMP31]], align 4 +; CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP30]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = atomicrmw add ptr addrspace(7) [[TMP8]], i64 6 monotonic, align 8 +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP33]], i64 536870912) +; CHECK-NEXT: [[TMP35:%.*]] = trunc i64 [[TMP34]] to i32 +; CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP35]], 4 +; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP8]], i32 [[TMP36]] +; CHECK-NEXT: store i32 {{-?[0-9]+}}, ptr addrspace(7) [[TMP37]], align 4 +; CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP36]], 1 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP8]], i32 [[TMP38]] +; CHECK-NEXT: store i32 {{-?[0-9]+}}, ptr addrspace(7) [[TMP39]], align 4 +; CHECK-NEXT: [[TMP40:%.*]] = add i32 [[TMP38]], 1 +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP8]], i32 [[TMP40]] +; CHECK-NEXT: store i32 0, ptr addrspace(7) [[TMP41]], align 4 +; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP40]], 1 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP8]], i32 [[TMP42]] +; CHECK-NEXT: store i32 {{-?[0-9]+}}, ptr addrspace(7) [[TMP43]], align 4 +; CHECK-NEXT: [[TMP44:%.*]] = add i32 [[TMP42]], 1 +; CHECK-NEXT: [[TMP45:%.*]] = 
getelementptr i32, ptr addrspace(7) [[TMP8]], i32 [[TMP44]] +; CHECK-NEXT: store i32 0, ptr addrspace(7) [[TMP45]], align 4 +; CHECK-NEXT: [[TMP46:%.*]] = add i32 [[TMP44]], 1 +; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr addrspace(7) [[TMP8]], i32 [[TMP46]] +; CHECK-NEXT: store i32 {{-?[0-9]+}}, ptr addrspace(7) [[TMP47]], align 4 +; CHECK-NEXT: [[TMP48:%.*]] = add i32 [[TMP46]], 1 ; CHECK-NEXT: ret void ; diff --git a/lgc/test/CsReconfigWorkgroup.lgc b/lgc/test/CsReconfigWorkgroup.lgc index 0ef36e6515..3235ced729 100644 --- a/lgc/test/CsReconfigWorkgroup.lgc +++ b/lgc/test/CsReconfigWorkgroup.lgc @@ -51,8 +51,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc %2 = bitcast i8 addrspace(7)* %0 to <3 x i32> addrspace(7)* store <3 x i32> %1, <3 x i32> addrspace(7)* %2, align 4 %imgdescptr = call <8 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.v8i32(i32 1, i32 0, i32 0, i32 1) - %imgdesc = load <8 x i32>, <8 x i32> addrspace(4)* %imgdescptr - %imgload = call <2 x float> (...) @lgc.create.image.load.v2f32(i32 1, i32 0, <8 x i32> %imgdesc, <2 x i32>) + %imgload = call <2 x float> (...) @lgc.create.image.load.v2f32(i32 1, i32 0, <8 x i32> addrspace(4)* %imgdescptr, <2 x i32>) %storeptr = getelementptr i8, i8 addrspace(7)* %0, i64 16 %storeptrcast = bitcast i8 addrspace(7)* %storeptr to <2 x float> addrspace(7)* store <2 x float> %imgload, <2 x float> addrspace(7)* %storeptrcast @@ -99,8 +98,7 @@ define dllexport spir_func void @lgc.shader.CS.main() local_unnamed_addr #0 !lgc %2 = bitcast i8 addrspace(7)* %0 to <3 x i32> addrspace(7)* store <3 x i32> %1, <3 x i32> addrspace(7)* %2, align 4 %imgdescptr = call <8 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.v8i32(i32 1, i32 0, i32 0, i32 1) - %imgdesc = load <8 x i32>, <8 x i32> addrspace(4)* %imgdescptr - %imgload = call <2 x float> (...) @lgc.create.image.load.v2f32(i32 1, i32 0, <8 x i32> %imgdesc, <2 x i32>) + %imgload = call <2 x float> (...) 
@lgc.create.image.load.v2f32(i32 1, i32 0, <8 x i32> addrspace(4)* %imgdescptr, <2 x i32>) %storeptr = getelementptr i8, i8 addrspace(7)* %0, i64 16 %storeptrcast = bitcast i8 addrspace(7)* %storeptr to <2 x float> addrspace(7)* store <2 x float> %imgload, <2 x float> addrspace(7)* %storeptrcast diff --git a/lgc/test/ElfRelocationSize.lgc b/lgc/test/ElfRelocationSize.lgc index e77214a9c1..3125c72577 100644 --- a/lgc/test/ElfRelocationSize.lgc +++ b/lgc/test/ElfRelocationSize.lgc @@ -66,12 +66,10 @@ entry: %0 = extractelement <2 x float> %texcoordadj, i32 0 %1 = extractelement <2 x float> %texcoordadj, i32 1 %2 = call <8 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.p4v8i32(i32 1, i32 0, i32 0, i32 1) - %3 = load <8 x i32>, <8 x i32> addrspace(4)* %2, align 32 %4 = call <4 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.p4v4i32(i32 2, i32 0, i32 0, i32 2) - %5 = load <4 x i32>, <4 x i32> addrspace(4)* %4, align 16 %6 = insertelement <2 x float> undef, float %0, i64 0 %7 = insertelement <2 x float> %6, float %1, i64 1 - %8 = call <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 0, <8 x i32> %3, <4 x i32> %5, i32 1, <2 x float> %7) + %8 = call <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 0, <8 x i32> addrspace(4)* %2, <4 x i32> addrspace(4)* %4, i32 1, <2 x float> %7) %9 = extractelement <4 x float> %8, i64 0 %10 = insertvalue %types.ResRet.f32.1 undef, float %9, 0 %11 = extractelement <4 x float> %8, i64 1 diff --git a/lgc/test/PartPipeline.lgc b/lgc/test/PartPipeline.lgc index a39eb3f36b..d368f9394c 100644 --- a/lgc/test/PartPipeline.lgc +++ b/lgc/test/PartPipeline.lgc @@ -74,12 +74,10 @@ entry: %0 = extractelement <2 x float> %texcoordadj, i32 0 %1 = extractelement <2 x float> %texcoordadj, i32 1 %2 = call <8 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.p4v8i32(i32 1, i32 1, i32 0, i32 1) - %3 = load <8 x i32>, <8 x i32> addrspace(4)* %2, align 32 %4 = call <4 x i32> addrspace(4)* (...) 
@lgc.create.get.desc.ptr.p4v4i32(i32 2, i32 2, i32 0, i32 2) - %5 = load <4 x i32>, <4 x i32> addrspace(4)* %4, align 16 %6 = insertelement <2 x float> undef, float %0, i64 0 %7 = insertelement <2 x float> %6, float %1, i64 1 - %8 = call <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 0, <8 x i32> %3, <4 x i32> %5, i32 1, <2 x float> %7) + %8 = call <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 0, <8 x i32> addrspace(4)* %2, <4 x i32> addrspace(4)* %4, i32 1, <2 x float> %7) %9 = extractelement <4 x float> %8, i64 0 %10 = insertvalue %types.ResRet.f32.1 undef, float %9, 0 %11 = extractelement <4 x float> %8, i64 1 diff --git a/lgc/test/PatchInvalidImageDescriptor.lgc b/lgc/test/PatchInvalidImageDescriptor.lgc index 32a48a7f9f..debed60f1e 100644 --- a/lgc/test/PatchInvalidImageDescriptor.lgc +++ b/lgc/test/PatchInvalidImageDescriptor.lgc @@ -4,7 +4,7 @@ ; CHECK-LABEL: IR Dump After Patch LLVM for workarounds ; GFX1010: extractelement <8 x i32> %{{[0-9]+}}, i64 3 -; GFX1010-NEXT: icmp sge i32 +; GFX1010: icmp sge i32 ; GFX1010-NEXT: and i32 ; GFX1010-NEXT: select i1 ; GFX1010-NEXT: [[PATCHED_DESC0:%[.a-zA-Z0-9]+]] = insertelement <8 x i32> %{{[0-9]+}} @@ -12,7 +12,7 @@ ; GFX1010: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> zeroinitializer, i32 15, i32 0, i32 0, <8 x i32> %{{[0-9]+}}, i32 0, i32 0) -; GFX1010: %.sample = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %.sampler, i1 false, i32 0, i32 0) +; GFX1010: %.sample = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i1 false, i32 0, i32 0) ; GFX1010: %.gather = call <4 x float> @llvm.amdgcn.image.gather4.l.2d.v4f32.f32(i32 1, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, <8 x i32> %{{[0-9]+}}, <4 x i32> %{{[0-9]+}}, i1 false, i32 0, i32 0) @@ -40,22 +40,20 @@ define 
dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !lgc %.desc.ptr1 = bitcast <8 x i32> addrspace(4)* %.desc.ptr2 to i8 addrspace(4)* %.desc.ptr0 = getelementptr i8, i8 addrspace(4)* %.desc.ptr1, i64 0 %.desc.ptr = bitcast i8 addrspace(4)* %.desc.ptr0 to <8 x i32> addrspace(4)* - %.desc = load <8 x i32>, <8 x i32> addrspace(4)* %.desc.ptr, align 32 %.sampler.ptr = call <4 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.p4v4i32(i32 2, i32 2, i32 0, i32 13) - %.sampler = load <4 x i32>, <4 x i32> addrspace(4)* %.sampler.ptr, align 16 - %.load = call <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 128, <8 x i32> %.desc, i32 1) - call void (...) @lgc.create.image.store(<4 x float> zeroinitializer, i32 1, i32 128, <8 x i32> %.desc, <2 x i32> zeroinitializer) + %.load = call <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 128, <8 x i32> addrspace(4)* %.desc.ptr, i32 1) + call void (...) @lgc.create.image.store(<4 x float> zeroinitializer, i32 1, i32 128, <8 x i32> addrspace(4)* %.desc.ptr, <2 x i32> zeroinitializer) - %.sample = call <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 128, <8 x i32> %.desc, <4 x i32> %.sampler, i32 1, <2 x float> zeroinitializer) - %.gather = call <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 128, <8 x i32> %.desc, <4 x i32> %.sampler, i32 37, <2 x float> zeroinitializer, i32 0, float 0.000000e+00) + %.sample = call <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 128, <8 x i32> addrspace(4)* %.desc.ptr, <4 x i32> addrspace(4)* %.sampler.ptr, i32 1, <2 x float> zeroinitializer) + %.gather = call <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 128, <8 x i32> addrspace(4)* %.desc.ptr, <4 x i32> addrspace(4)* %.sampler.ptr, i32 37, <2 x float> zeroinitializer, i32 0, float 0.000000e+00) - %.atomic = call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 0, i32 128, i32 0, <8 x i32> %.desc, i32 0, i32 1) #0 + %.atomic = call i32 (...) 
@lgc.create.image.atomic.i32(i32 2, i32 0, i32 128, i32 0, <8 x i32> addrspace(4)* %.desc.ptr, i32 0, i32 1) #0 - %.lod = call <2 x float> (...) @lgc.create.image.get.lod.v2f32(i32 1, i32 128, <8 x i32> %.desc, <4 x i32> %.sampler, <2 x float> zeroinitializer) + %.lod = call <2 x float> (...) @lgc.create.image.get.lod.v2f32(i32 1, i32 128, <8 x i32> addrspace(4)* %.desc.ptr, <4 x i32> addrspace(4)* %.sampler.ptr, <2 x float> zeroinitializer) - %.query.size = call <2 x i32> (...) @lgc.create.image.query.size.v2i32(i32 1, i32 128, <8 x i32> %.desc, i32 0) - %.query.levels = call i32 (...) @lgc.create.image.query.levels.i32(i32 1, i32 128, <8 x i32> %.desc) + %.query.size = call <2 x i32> (...) @lgc.create.image.query.size.v2i32(i32 1, i32 128, <8 x i32> addrspace(4)* %.desc.ptr, i32 0) + %.query.levels = call i32 (...) @lgc.create.image.query.levels.i32(i32 1, i32 128, <8 x i32> addrspace(4)* %.desc.ptr) %lane = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) ; just some source of divergence %ofs = mul i32 %lane, 32 @@ -65,8 +63,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !lgc %.desc2.ptr1 = bitcast <8 x i32> addrspace(4)* %.desc2.ptr2 to i8 addrspace(4)* %.desc2.ptr0 = getelementptr i8, i8 addrspace(4)* %.desc2.ptr1, i32 %ofs %.desc2.ptr = bitcast i8 addrspace(4)* %.desc2.ptr0 to <8 x i32> addrspace(4)* - %.desc2 = load <8 x i32>, <8 x i32> addrspace(4)* %.desc2.ptr, align 32 - call void (...) @lgc.create.image.store(<4 x float> zeroinitializer, i32 0, i32 8, <8 x i32> %.desc2, i32 zeroinitializer) + call void (...) 
@lgc.create.image.store(<4 x float> zeroinitializer, i32 0, i32 8, <8 x i32> addrspace(4)* %.desc2.ptr, i32 zeroinitializer) ret void } diff --git a/lgc/test/SubgroupClusteredReduction.lgc b/lgc/test/SubgroupClusteredReduction.lgc new file mode 100644 index 0000000000..40b53f9584 --- /dev/null +++ b/lgc/test/SubgroupClusteredReduction.lgc @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py UTC_ARGS: --tool lgc +; RUN: lgc -o - --mcpu=gfx1100 --emit-llvm %s | FileCheck -check-prefixes=CHECK %s + +define dllexport spir_func i32 @fn(i32 %value1, i32 %value2) !lgc.shaderstage !0 { +.entry: + %r1 = call i32 (...) @lgc.create.subgroup.clustered.reduction.i32(i32 11, i32 %value1, i32 64) + %r2 = call i32 (...) @lgc.create.subgroup.clustered.reduction.i32(i32 11, i32 %value2, i32 32) + %r = add i32 %r1, %r2 + ret i32 %r +} + +declare i32 @lgc.create.subgroup.clustered.reduction.i32(...) + +; ShaderStage::Compute +!0 = !{i32 7} + +; Setting Threadgroup Dimensions to 64 x 1 x 1 +!llpc.compute.mode = !{!1} +!1 = !{i32 64, i32 1, i32 1} +; CHECK-LABEL: @_amdgpu_cs_main( +; CHECK-NEXT: .entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[VALUE1:%.*]], i32 0) +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[TMP0]], i32 177, i32 15, i32 15, i1 true) +; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[TMP2]], i32 78, i32 15, i32 15, i1 true) +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[TMP4]], i32 321, i32 15, i32 15, i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = or i32 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[TMP6]], i32 320, i32 15, i32 15, i1 true) +; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call 
i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[TMP8]], i32 -1, i32 -1, i1 true, i1 false) +; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.permlane64(i32 [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = or i32 [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP12]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[VALUE2:%.*]], i32 0) +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[TMP14]], i32 177, i32 15, i32 15, i1 true) +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[TMP16]], i32 78, i32 15, i32 15, i1 true) +; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[TMP18]], i32 321, i32 15, i32 15, i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 undef, i32 [[TMP20]], i32 320, i32 15, i32 15, i1 true) +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.permlanex16(i32 undef, i32 [[TMP22]], i32 -1, i32 -1, i1 true, i1 false) +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[R2:%.*]] = call i32 @llvm.amdgcn.wwm.i32(i32 [[TMP24]]) +; CHECK-NEXT: [[R:%.*]] = add i32 [[R2]], [[R1]] +; CHECK-NEXT: ret i32 [[R]] +; diff --git a/lgc/test/TaskShaderOps.lgc b/lgc/test/TaskShaderOps.lgc index 16d98df3dd..88a85fee82 100644 --- a/lgc/test/TaskShaderOps.lgc +++ b/lgc/test/TaskShaderOps.lgc @@ -51,7 +51,7 @@ ; CHECK-NEXT: [[meshPipeStatsBufAddr64:%[0-9]*]] = bitcast <2 x i32> [[meshPipeStatsBufAddr2x32]] to i64 ; CHECK-NEXT: [[meshPipeStatsBufAddr:%[0-9]*]] = inttoptr i64 
[[meshPipeStatsBufAddr64]] to ptr addrspace(1) ; CHECK: [[numTaskThreadsPtr8:%[0-9]*]] = getelementptr i8, ptr addrspace(1) [[meshPipeStatsBufAddr]], i64 16 -; CHECK: %{{[0-9]*}} = atomicrmw add ptr addrspace(1) [[numTaskThreadsPtr8]], i64 %{{[0-9]*}} monotonic, align 8 +; CHECK: %{{[0-9]*}} = atomicrmw add ptr addrspace(1) [[numTaskThreadsPtr8]], i64 %{{[0-9]*}} syncscope("agent") monotonic, align 8 ; CHECK: [[ringSize:%[0-9]*]] = extractelement <4 x i32> [[drawDataRingDesc]], i64 2 ; CHECK-NEXT: [[numEntries:%[0-9]*]] = lshr i32 [[ringSize]], 4 ; CHECK-NEXT: [[wrapMask:%[0-9]*]] = add nuw nsw i32 [[numEntries]], 268435455 diff --git a/lgc/test/TestWaterfallLoopForStruct.lgc b/lgc/test/TestWaterfallLoopForStruct.lgc index 448a7ceee2..c43691bddf 100644 --- a/lgc/test/TestWaterfallLoopForStruct.lgc +++ b/lgc/test/TestWaterfallLoopForStruct.lgc @@ -17,10 +17,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi %6 = mul i32 %0, %2 %7 = sext i32 %6 to i64 %8 = getelementptr i8, ptr addrspace(4) %1, i64 %7 - %9 = insertvalue { ptr addrspace(4), i32, i32, i32 } %5, ptr addrspace(4) %8, 0 - %10 = load <8 x i32>, ptr addrspace(4) %8, align 32, !invariant.load !12 - %11 = insertvalue [3 x <8 x i32>] poison, <8 x i32> %10, 0 - %12 = call { <4 x float>, i32 } (...) @"lgc.create.image.load.s[v4f32,i32]"(i32 1, i32 8, <8 x i32> %10, <2 x i32> ) + %12 = call { <4 x float>, i32 } (...) 
@"lgc.create.image.load.s[v4f32,i32]"(i32 1, i32 8, ptr addrspace(4) %8, <2 x i32> ) %13 = extractvalue { <4 x float>, i32 } %12, 1 %14 = extractvalue { <4 x float>, i32 } %12, 0 %15 = icmp sgt i32 %13, 0 @@ -92,22 +89,20 @@ attributes #2 = { nounwind willreturn memory(read) } ; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP2]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP7]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { ptr addrspace(4), i32, i32, i32 } [[TMP10]], ptr addrspace(4) [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP13]], align 32, !invariant.load !12 -; CHECK-NEXT: [[TMP16:%.*]] = insertvalue [3 x <8 x i32>] poison, <8 x i32> [[TMP15]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP11]]) -; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP17]], <8 x i32> [[TMP15]]) -; CHECK-NEXT: [[TMP19:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 1, i32 1, <8 x i32> [[TMP18]], i32 1, i32 0) -; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <4 x float>, i32 } [[TMP19]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = call <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP17]], <4 x float> [[TMP20]]) -; CHECK-NEXT: [[TMP22:%.*]] = extractvalue { <4 x float>, i32 } [[TMP19]], 1 -; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.waterfall.end.i32(i32 [[TMP17]], i32 [[TMP22]]) -; CHECK-NEXT: [[TMP24:%.*]] = insertvalue { <4 x float>, i32 } poison, <4 x float> [[TMP21]], 0 -; CHECK-NEXT: [[TMP25:%.*]] = insertvalue { <4 x float>, i32 } [[TMP24]], i32 [[TMP23]], 1 -; CHECK-NEXT: [[TMP26:%.*]] = extractvalue { <4 x float>, i32 } [[TMP25]], 1 -; CHECK-NEXT: [[TMP27:%.*]] = extractvalue { <4 x float>, i32 } [[TMP25]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP26]], 0 -; CHECK-NEXT: [[TMP29:%.*]] = select i1 
[[TMP28]], <4 x float> [[TMP27]], <4 x float> zeroinitializer -; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP29]]) #[[ATTR5:[0-9]+]] +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP13]], align 32, !invariant.load !12 +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP11]]) +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP15]], <8 x i32> [[TMP14]]) +; CHECK-NEXT: [[TMP17:%.*]] = call { <4 x float>, i32 } @llvm.amdgcn.image.load.2d.sl_v4f32i32s.i32(i32 15, i32 1, i32 1, <8 x i32> [[TMP16]], i32 1, i32 0) +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { <4 x float>, i32 } [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP15]], <4 x float> [[TMP18]]) +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { <4 x float>, i32 } [[TMP17]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.waterfall.end.i32(i32 [[TMP15]], i32 [[TMP20]]) +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <4 x float>, i32 } poison, <4 x float> [[TMP19]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { <4 x float>, i32 } [[TMP22]], i32 [[TMP21]], 1 +; CHECK-NEXT: [[TMP24:%.*]] = extractvalue { <4 x float>, i32 } [[TMP23]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <4 x float>, i32 } [[TMP23]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 [[TMP24]], 0 +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], <4 x float> [[TMP25]], <4 x float> zeroinitializer +; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP27]]) #[[ATTR5:[0-9]+]] ; CHECK-NEXT: ret void ; diff --git a/lgc/test/TextureRange.lgc b/lgc/test/TextureRange.lgc index 0f50ab4c50..3cda8fc4f8 100644 --- a/lgc/test/TextureRange.lgc +++ b/lgc/test/TextureRange.lgc @@ -87,11 +87,9 @@ define dllexport spir_func void @lgc.shader.FS.PSMain() local_unnamed_addr #0 !s %13 = fmul reassoc nnan nsz arcp 
contract afn float %10, %12 %14 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 3221225472, i32 1) %15 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 3221225472, i32 1) - %16 = load <8 x i32>, ptr addrspace(4) %14, align 32 %17 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 2147483648, i32 0) %18 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 2147483648, i32 0) - %19 = load <4 x i32>, ptr addrspace(4) %17, align 16 - %20 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample__v4f32(i32 1, i32 512, <8 x i32> %16, <4 x i32> %19, i32 1, <2 x float> %6) + %20 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample__v4f32(i32 1, i32 512, ptr addrspace(4) %14, ptr addrspace(4) %17, i32 1, <2 x float> %6) %.splatinsert = insertelement <4 x float> poison, float %13, i64 0 %21 = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer %scale = fmul reassoc nnan nsz arcp contract afn <4 x float> %20, %21 diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc new file mode 100644 index 0000000000..90ffa98e22 --- /dev/null +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1010muladd.lgc @@ -0,0 +1,32 @@ +; RUN: lgc -march=amdgcn -o - --mcpu=gfx1010 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s + +define void @matmul_f16f32_emulator(ptr addrspace(3) %out0, <8 x float> %a, <8 x float> %b, <8 x float> %c0) !lgc.shaderstage !0 { +; CHECK-NOT: v_dot + %value = call <8 x float> (...) @lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 1) + call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value) + ret void +} + +define void @matmul_i16i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { +; CHECK-NOT: v_dot + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 4) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) + ret void +} + +define void @matmul_i8i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { +; CHECK-NOT: v_dot + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 3) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) + ret void +} + +declare <8 x float> @lgc.cooperative.matrix.muladd__v8f8(...) +declare <8 x i32> @lgc.cooperative.matrix.muladd__v8i32(...) +declare void @lgc.cooperative.matrix.store(...) + +; ShaderStage::Compute +!0 = !{i32 7} +; Setting Threadgroup Dimensions to 64 x 1 x 1 +!llpc.compute.mode = !{!1} +!1 = !{i32 64, i32 1, i32 1} diff --git a/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc new file mode 100644 index 0000000000..88292bf642 --- /dev/null +++ b/lgc/test/Transforms/LowerCooperativeMatrix/gfx1011muladd.lgc @@ -0,0 +1,33 @@ +; RUN: lgc -march=amdgcn -o - --mcpu=gfx1011 -filetype=asm %s | FileCheck -check-prefixes=CHECK %s + +define void @matmul_f16f32_emulator(ptr addrspace(3) %out0, <8 x float> %a, <8 x float> %b, <8 x float> %c0) !lgc.shaderstage !0 { +; CHECK: v_dot2c_f32_f16 + %value = call <8 x float> (...) 
@lgc.cooperative.matrix.muladd__v8f8(<8 x float> %a, <8 x float> %b, <8 x float> %c0, i1 true, i1 true, i1 false, i1 false, i32 2, i32 1) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> %value) + ret void +} + +define void @matmul_i16i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { +; CHECK: v_dot2_i32_i16 + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 4) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) + ret void +} + +define void @matmul_i8i32_emulator(ptr addrspace(3) %out0, <8 x i32> %a, <8 x i32> %b, <8 x i32> %c0) !lgc.shaderstage !0 { +; CHECK: v_dot4c_i32_i8 + %value = call <8 x i32> (...) @lgc.cooperative.matrix.muladd__v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c0, i1 true, i1 true, i1 false, i1 false, i32 5, i32 3) + call void (...) @lgc.cooperative.matrix.store(ptr addrspace(3) %out0, i32 4, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x i32> %value) + ret void +} + +declare <8 x float> @lgc.cooperative.matrix.muladd__v8f8(...) +declare <8 x i32> @lgc.cooperative.matrix.muladd__v8i32(...) +declare void @lgc.cooperative.matrix.store(...) 
+ +; ShaderStage::Compute +!0 = !{i32 7} +; Setting Threadgroup Dimensions to 64 x 1 x 1 +!llpc.compute.mode = !{!1} +!1 = !{i32 64, i32 1, i32 1} + diff --git a/lgc/test/Transforms/LowerDebugPrintf/basic.lgc b/lgc/test/Transforms/LowerDebugPrintf/basic.lgc index a20f4e0fd5..312478af57 100644 --- a/lgc/test/Transforms/LowerDebugPrintf/basic.lgc +++ b/lgc/test/Transforms/LowerDebugPrintf/basic.lgc @@ -1,15 +1,19 @@ -; RUN: lgc -o - -passes='require,lgc-lower-debug-printf' %s | FileCheck --check-prefixes=IR %s -; RUN: lgc -o - -passes='require,lgc-lower-debug-printf,print' %s -o /dev/null 2>&1 | FileCheck --check-prefixes=PALMD %s +; RUN: lgc -o - -passes="require,lgc-lower-debug-printf" %s | FileCheck --check-prefixes=IR %s +; RUN: lgc -o - -passes="require,lgc-lower-debug-printf,print" %s -o /dev/null 2>&1 | FileCheck --check-prefixes=PALMD %s -@str = internal addrspace(4) global [8 x i8] c"Test: %u" +@str = private unnamed_addr constant [8 x i8] c"Test: %u", align 1 -define spir_func void @simple(ptr addrspace(7) %buffer) !lgc.shaderstage !0 { +define spir_func void @simple() !lgc.shaderstage !0 { ; IR-LABEL: @simple( ; IR-NOT: call {{.*}} @lgc.debug.printf - call void (...) @lgc.debug.printf(ptr addrspace(7) %buffer, ptr addrspace(4) @str, i32 42) + call void (...) @lgc.debug.printf(ptr nonnull @str, i32 42) ret void } +!lgc.user.data.nodes = !{!4, !5} +!4 = !{!"DescriptorTableVaPtr", i32 7, i32 0, i32 0, i32 1, i32 1} +!5 = !{!"DescriptorBuffer", i32 6, i32 0, i32 0, i32 4, i32 -1, i32 6, i32 4} + ; IR: !amdgpu.pal.metadata.msgpack = ; PALMD: amdpal.format_strings: diff --git a/lgc/test/lgcdis.lgc b/lgc/test/lgcdis.lgc index 7b970b75ac..b90f44f093 100644 --- a/lgc/test/lgcdis.lgc +++ b/lgc/test/lgcdis.lgc @@ -95,10 +95,8 @@ define dllexport void @lgc.shader.FS.main() !lgc.shaderstage !25 { entry: %TEXCOORD = call <2 x float> (...) @lgc.create.read.generic.input.v2f32(i32 1, i32 0, i32 0, i32 1, i32 16, i32 poison) %imageptr = call <8 x i32> addrspace(4)* (...) 
@lgc.create.get.desc.ptr.p4v8i32(i32 1, i32 1, i32 0, i32 1) - %image = load <8 x i32>, <8 x i32> addrspace(4)* %imageptr, align 32 %samplerptr = call <4 x i32> addrspace(4)* (...) @lgc.create.get.desc.ptr.p4v4i32(i32 2, i32 2, i32 0, i32 2) - %sampler = load <4 x i32>, <4 x i32> addrspace(4)* %samplerptr, align 16 - %sample = call <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 0, <8 x i32> %image, <4 x i32> %sampler, i32 1, <2 x float> %TEXCOORD) + %sample = call <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 0, <8 x i32> addrspace(4)* %imageptr, <4 x i32> addrspace(4)* %samplerptr, i32 1, <2 x float> %TEXCOORD) call void (...) @lgc.create.write.generic.output(<4 x float> %sample, i32 0, i32 0, i32 0, i32 1, i32 0, i32 poison) ret void } diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc index cf01ecaead..186f904c28 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest1.lgc @@ -8,31 +8,31 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 32, !invariant.load 
[[META16:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP7]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 32, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load [[META16]] -; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[TMP13]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 32, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 32, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] 
= load <4 x i32>, ptr addrspace(4) [[TMP19]], align 32, !invariant.load [[META16]] -; CHECK-NEXT: [[TMP21:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP16]], <4 x i32> [[TMP20]]) -; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP15]], <4 x i32> [[TMP21]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 32, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP21:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP16]], <8 x i32> [[TMP20]]) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0) ; CHECK-NEXT: ret void ; .entry: @@ -42,13 +42,11 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi %3 = mul i32 %0, %2 %4 = sext i32 %3 to i64 %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 - %6 = load <4 x i32>, ptr addrspace(4) %5, align 32, !invariant.load !16 %7 = mul i32 %0, %2 %8 = sext i32 %7 to i64 %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 - %10 = load <4 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 - %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, <4 x i32> %10, i32 0) - call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, <4 x i32> %6, i32 1) + %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, ptr addrspace(4) %9, i32 0) + call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, ptr addrspace(4) %5, i32 1) ret void } @@ -99,3 +97,8 @@ attributes #3 = { nounwind memory(write) } !14 = !{i32 0} !15 = !{i32 1} !16 = !{} +;. +; CHECK: [[META14]] = !{i32 0} +; CHECK: [[META15]] = !{i32 1} +; CHECK: [[META16]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc index 7835bf62e7..8d8609b8ea 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest10.lgc @@ -10,7 +10,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -44,8 +44,8 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], [[PHI]] ; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 ; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] -; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] -; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) ; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 @@ -90,9 +90,7 @@ loop.latch: ; preds = %bb2, %bb1 %i6 = mul i32 %phi.ind, %phi %i7 
= sext i32 %i6 to i64 %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 - %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 - %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 - %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %i5, ptr addrspace(4) %i8, i32 1, <2 x float> zeroinitializer) call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) %ind = add i32 %phi.ind, 1 %cond2 = icmp ne i32 %ind, 1000 @@ -147,3 +145,8 @@ attributes #3 = { nounwind } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc index 2a2d450d1b..68797e0c4e 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest11.lgc @@ -10,7 +10,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -44,8 +44,8 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: br 
label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[PHI:%.*]] = phi ptr addrspace(4) [ [[I5]], [[BB1]] ], [ [[I8]], [[BB2]] ] -; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[PHI]], align 16, !invariant.load [[META10:![0-9]+]] -; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[PHI]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[PHI]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[PHI]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v8i32(i32 0, <8 x i32> [[I10]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP12]], <4 x i32> [[I9]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP13]], <8 x i32> [[I10]]) @@ -86,9 +86,7 @@ bb2: ; preds = %loop loop.latch: ; preds = %bb2, %bb1 %phi = phi ptr addrspace(4) [ %i5, %bb1 ], [ %i8, %bb2 ] - %i9 = load <4 x i32>, ptr addrspace(4) %phi, align 16, !invariant.load !10 - %i10 = load <8 x i32>, ptr addrspace(4) %phi, align 32, !invariant.load !10 - %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %phi, ptr addrspace(4) %phi, i32 1, <2 x float> zeroinitializer) call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) %ind = add i32 %phi.ind, 1 %cond2 = icmp ne i32 %ind, 1000 @@ -143,3 +141,8 @@ attributes #3 = { nounwind } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc index afd0072c01..e5909c6acb 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest12.lgc @@ -11,7 +11,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -38,8 +38,8 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 ; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 ; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] -; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] -; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) ; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 @@ -75,9 +75,7 @@ loop: ; preds = %loop, %.entry %i6 = mul i32 %phi.ind, %b %i7 = sext 
i32 %i6 to i64 %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 - %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 - %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 - %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %i5, ptr addrspace(4) %i8, i32 1, <2 x float> zeroinitializer) %i12 = fadd <4 x float> %phi.img, %ind = add i32 %phi.ind, 1 %cond = icmp ne i32 %ind, 1000 @@ -133,3 +131,8 @@ attributes #3 = { nounwind } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc index cd41d3a2bf..9cb2e24f8f 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest13.lgc @@ -11,7 +11,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -31,16 +31,15 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I3:%.*]] = mul i32 [[I]], 48 ; CHECK-NEXT: [[I4:%.*]] = sext i32 [[I3]] to i64 ; 
CHECK-NEXT: [[I5:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[I4]] -; CHECK-NEXT: [[L:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10:![0-9]+]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[PHI_IND:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[IND:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PHI_LOAD:%.*]] = phi <8 x i32> [ [[L]], [[DOTENTRY]] ], [ [[I10:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PHI_LOAD1:%.*]] = phi ptr addrspace(4) [ [[I5]], [[DOTENTRY]] ], [ [[I8:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 ; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 -; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] -; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] -; CHECK-NEXT: [[I10]] = load <8 x i32>, ptr addrspace(4) [[I8]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[I8]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] +; CHECK-NEXT: [[PHI_LOAD:%.*]] = load <8 x i32>, ptr addrspace(4) [[PHI_LOAD1]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v8i32(i32 0, <8 x i32> [[PHI_LOAD]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 [[TMP12]], i32 [[I6]]) ; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.readfirstlane.v8i32.v8i32(i32 [[TMP13]], <8 x i32> [[PHI_LOAD]]) @@ -65,19 +64,16 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi %i3 = mul i32 %i, %a %i4 = sext i32 %i3 to i64 %i5 = getelementptr i8, ptr addrspace(4) %i1, i64 %i4 - %l = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 br label %loop loop: ; preds = %loop, %.entry %phi.ind = phi i32 [ 0, %.entry ], [ %ind, %loop ] - %phi.load = phi <8 x i32> [ %l, %.entry 
], [ %i10, %loop ] + %phi.load = phi ptr addrspace(4) [ %i5, %.entry ], [ %i8, %loop ] %b = call i32 (...) @lgc.create.get.desc.stride__i32(i32 2, i32 2, i64 0, i32 7) %i6 = mul i32 %phi.ind, %b %i7 = sext i32 %i6 to i64 %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 - %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 - %i10 = load <8 x i32>, ptr addrspace(4) %i8, align 32, !invariant.load !10 - %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %phi.load, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %phi.load, ptr addrspace(4) %i8, i32 1, <2 x float> zeroinitializer) call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) %ind = add i32 %phi.ind, 1 %cond = icmp ne i32 %ind, 1000 @@ -132,3 +128,8 @@ attributes #3 = { nounwind } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc index bba218fbc6..34edce02c5 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest14.lgc @@ -12,7 +12,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -39,8 +39,8 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 ; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 ; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] -; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] -; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP25:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP26:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) ; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 @@ -56,13 +56,14 @@ define dllexport spir_func void @lgc.shader.FS.main() 
local_unnamed_addr #0 !spi ; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] ; CHECK: exit: +; CHECK-NEXT: [[TMP27:%.*]] = load <8 x i32>, ptr addrspace(4) [[I8]], align 32, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I6]]) ; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP21]], i32 [[I6]]) ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP24]], align 16, !invariant.load [[META10]] -; CHECK-NEXT: [[TMP26:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP21]], <4 x i32> [[TMP25]]) -; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[I12]], <4 x i32> [[TMP26]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP28:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP29:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP21]], <8 x i32> [[TMP28]]) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[I12]], i32 15, i32 1, <8 x i32> [[TMP29]], i32 0, i32 0) ; CHECK-NEXT: ret void ; .entry: @@ -82,16 +83,14 @@ loop: ; preds = %loop, %.entry %i6 = mul i32 %phi.ind, %b %i7 = sext i32 %i6 to i64 %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 - %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 - %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 - %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %i5, ptr addrspace(4) %i8, i32 1, <2 x float> zeroinitializer) %i12 = fadd <4 x float> %phi.img, %ind = add i32 %phi.ind, 1 %cond = icmp ne i32 %ind, 1000 br i1 %cond, label %loop, label %exit exit: ; preds = %loop - call void (...) @lgc.create.image.store(<4 x float> %i12, i32 0, i32 8, <4 x i32> %i9, i32 1) + call void (...) @lgc.create.image.store(<4 x float> %i12, i32 0, i32 8, ptr addrspace(4) %i8, i32 1) ret void } @@ -140,3 +139,8 @@ attributes #3 = { nounwind memory(write) } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc index c22cf5fc59..ab426db63d 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest15.lgc @@ -11,7 +11,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -38,8 +38,8 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 ; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 ; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] -; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load 
[[META10:![0-9]+]] -; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP25:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP26:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) ; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 @@ -51,13 +51,14 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) ; CHECK-NEXT: [[I11]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) ; CHECK-NEXT: [[I12:%.*]] = fadd <4 x float> [[PHI_IMG]], +; CHECK-NEXT: [[TMP27:%.*]] = load <8 x i32>, ptr addrspace(4) [[I8]], align 32, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I6]]) ; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP21]], i32 [[I6]]) ; CHECK-NEXT: [[TMP23:%.*]] = sext i32 [[TMP22]] to i64 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP24]], align 16, !invariant.load [[META10]] -; CHECK-NEXT: [[TMP26:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP21]], <4 x i32> [[TMP25]]) -; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[I12]], <4 x i32> [[TMP26]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: 
[[TMP28:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP24]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP29:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP21]], <8 x i32> [[TMP28]]) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[I12]], i32 15, i32 1, <8 x i32> [[TMP29]], i32 0, i32 0) ; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] @@ -81,11 +82,9 @@ loop: ; preds = %loop, %.entry %i6 = mul i32 %phi.ind, %b %i7 = sext i32 %i6 to i64 %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 - %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 - %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 - %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %i5, ptr addrspace(4) %i8, i32 1, <2 x float> zeroinitializer) %i12 = fadd <4 x float> %phi.img, - call void (...) @lgc.create.image.store(<4 x float> %i12, i32 0, i32 8, <4 x i32> %i9, i32 1) + call void (...) @lgc.create.image.store(<4 x float> %i12, i32 0, i32 8, ptr addrspace(4) %i8, i32 1) %ind = add i32 %phi.ind, 1 %cond = icmp ne i32 %ind, 1000 br i1 %cond, label %loop, label %exit @@ -139,3 +138,8 @@ attributes #3 = { nounwind memory(write) } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc index 573c55b677..39f2ed7d68 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest16.lgc @@ -11,7 +11,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -37,16 +37,16 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I6:%.*]] = mul i32 [[PHI_IND]], 48 ; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 ; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] -; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] -; CHECK-NEXT: [[I10:%.*]] = load <4 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[I10:%.*]] = load <4 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10:![0-9]+]] ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[I10]] to <4 x float> +; CHECK-NEXT: [[TMP17:%.*]] = load <8 x i32>, ptr addrspace(4) [[I8]], align 32, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I6]]) ; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP13]], i32 [[I6]]) ; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 ; CHECK-NEXT: 
[[TMP16:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP16]], align 16, !invariant.load [[META10]] -; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP13]], <4 x i32> [[TMP17]]) -; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP12]], <4 x i32> [[TMP18]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP16]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP13]], <8 x i32> [[TMP18]]) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP12]], i32 15, i32 1, <8 x i32> [[TMP19]], i32 0, i32 0) ; CHECK-NEXT: [[IND]] = add i32 [[PHI_IND]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IND]], 1000 ; CHECK-NEXT: br i1 [[COND]], label [[LOOP]], label [[EXIT:%.*]] @@ -69,9 +69,8 @@ loop: ; preds = %loop, %.entry %i6 = mul i32 %phi.ind, %b %i7 = sext i32 %i6 to i64 %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 - %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 %i10 = load <4 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 - call void (...) @lgc.create.image.store(<4 x i32> %i10, i32 0, i32 8, <4 x i32> %i9, i32 1) + call void (...) @lgc.create.image.store(<4 x i32> %i10, i32 0, i32 8, ptr addrspace(4) %i8, i32 1) %ind = add i32 %phi.ind, 1 %cond = icmp ne i32 %ind, 1000 br i1 %cond, label %loop, label %exit @@ -125,3 +124,8 @@ attributes #3 = { nounwind memory(write) } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc index bca91884fb..c3ed6f1215 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest2.lgc @@ -8,23 +8,23 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP5]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP5]], align 16, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <4 x float> +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 32, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP8]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 ; CHECK-NEXT: 
[[TMP11:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP11]], align 16, !invariant.load [[META16]] -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP8]], <4 x i32> [[TMP12]]) -; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP7]], <4 x i32> [[TMP13]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP12:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP11]], align 32, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP8]], <8 x i32> [[TMP12]]) +; CHECK-NEXT: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP7]], i32 15, i32 1, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: ret void ; .entry: @@ -34,10 +34,9 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi %3 = mul i32 %0, %2 %4 = sext i32 %3 to i64 %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 - %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 %7 = getelementptr i8, ptr addrspace(4) %1, i64 %4 %8 = load <4 x i32>, ptr addrspace(4) %7, align 16, !invariant.load !16 - call void (...) @lgc.create.image.store(<4 x i32> %8, i32 0, i32 8, <4 x i32> %6, i32 1) + call void (...) @lgc.create.image.store(<4 x i32> %8, i32 0, i32 8, ptr addrspace(4) %5, i32 1) ret void } @@ -86,3 +85,8 @@ attributes #4 = { nounwind } !14 = !{i32 0} !15 = !{i32 1} !16 = !{} +;. +; CHECK: [[META14]] = !{i32 0} +; CHECK: [[META15]] = !{i32 1} +; CHECK: [[META16]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc index 21d460b530..4616e701a2 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest3.lgc @@ -8,7 +8,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison @@ -18,12 +18,12 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 32, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr 
addrspace(4) [[TMP12]], align 32, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP13]], <4 x i32> , i1 false, i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) ; CHECK-NEXT: ret void @@ -39,8 +39,7 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi %7 = mul i32 %0, %2 %8 = sext i32 %7 to i64 %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 - %10 = load <8 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 - %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %10, <4 x i32> , i32 1, <2 x float> zeroinitializer) + %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %9, <4 x i32> , i32 1, <2 x float> zeroinitializer) ret void } @@ -87,3 +86,8 @@ attributes #2 = { nounwind memory(none) } !14 = !{i32 0} !15 = !{i32 1} !16 = !{} +;. +; CHECK: [[META14]] = !{i32 0} +; CHECK: [[META15]] = !{i32 1} +; CHECK: [[META16]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc index a6076a3787..1fbc044bd6 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest4.lgc @@ -10,7 +10,7 @@ declare <4 x i32> @foo1(i32 %V) ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison @@ -20,14 +20,15 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 16, !invariant.load [[META16]] -; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @foo1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP8:%.*]] = call ptr addrspace(4) @foo1(i32 [[TMP0]]) +; CHECK-NEXT: [[TMP19:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 32, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP8]], align 16, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.v4i32(i32 [[TMP10]], <4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP12:%.*]] = call i32 
@llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP11]], i32 [[TMP5]]) ; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 32, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.readfirstlane.v4i32.v4i32(i32 [[TMP11]], <4 x i32> [[TMP9]]) ; CHECK-NEXT: [[TMP17:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP15]], <4 x i32> [[TMP16]], i1 false, i32 0, i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP11]], <4 x float> [[TMP17]]) @@ -44,9 +45,8 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi %7 = mul i32 %0, %2 %8 = sext i32 %7 to i64 %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 - %10 = load <8 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 - %11 = call <4 x i32> @foo1(i32 %0) - %12 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %10, <4 x i32> %11, i32 1, <2 x float> zeroinitializer) + %11 = call ptr addrspace(4) @foo1(i32 %0) + %12 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %9, ptr addrspace(4) %11, i32 1, <2 x float> zeroinitializer) ret void } @@ -93,3 +93,8 @@ attributes #2 = { nounwind memory(none) } !14 = !{i32 0} !15 = !{i32 1} !16 = !{} +;. +; CHECK: [[META14]] = !{i32 0} +; CHECK: [[META15]] = !{i32 1} +; CHECK: [[META16]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc index 6cb1fdfdcc..085be16a66 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest5.lgc @@ -8,7 +8,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spirv.ExecutionModel !14 !lgc.shaderstage !15 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.VS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !14 !lgc.shaderstage [[META15:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META14:![0-9]+]] !lgc.shaderstage [[META15:![0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @lgc.input.import.generic__i32(i1 false, i32 0, i32 0, i32 0, i32 poison) ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP0]], 0 @@ -17,25 +17,25 @@ define dllexport spir_func void @lgc.shader.VS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], poison ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP3]], align 16, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP0]], poison ; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP7]], align 16, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP8:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP7]], align 32, !invariant.load [[META16:![0-9]+]] ; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP5]]) ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP9]], i32 
[[TMP5]]) ; CHECK-NEXT: [[TMP11:%.*]] = sext i32 [[TMP10]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP12]], align 16, !invariant.load [[META16]] -; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32(<4 x i32> [[TMP13]], i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP13:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP12]], align 32, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 0, <8 x i32> [[TMP13]], i32 0, i32 0) ; CHECK-NEXT: [[TMP15:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP9]], <4 x float> [[TMP14]]) +; CHECK-NEXT: [[TMP22:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP3]], align 32, !invariant.load [[META16]] ; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP16]], i32 [[TMP1]]) ; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr addrspace(4) poison, i64 [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP19]], align 16, !invariant.load [[META16]] -; CHECK-NEXT: [[TMP21:%.*]] = call <4 x i32> @llvm.amdgcn.waterfall.last.use.v4i32(i32 [[TMP16]], <4 x i32> [[TMP20]]) -; CHECK-NEXT: call void @llvm.amdgcn.struct.buffer.store.format.v4f32(<4 x float> [[TMP15]], <4 x i32> [[TMP21]], i32 1, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP19]], align 32, !invariant.load [[META16]] +; CHECK-NEXT: [[TMP21:%.*]] = call <8 x i32> @llvm.amdgcn.waterfall.last.use.v8i32(i32 [[TMP16]], <8 x i32> [[TMP20]]) +; CHECK-NEXT: call void 
@llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> [[TMP15]], i32 15, i32 1, <8 x i32> [[TMP21]], i32 0, i32 0) ; CHECK-NEXT: br label [[RET]] ; CHECK: ret: ; CHECK-NEXT: ret void @@ -51,13 +51,11 @@ bb: ; preds = %entry %3 = mul i32 %0, %2 %4 = sext i32 %3 to i64 %5 = getelementptr i8, ptr addrspace(4) %1, i64 %4 - %6 = load <4 x i32>, ptr addrspace(4) %5, align 16, !invariant.load !16 %7 = mul i32 %0, %2 %8 = sext i32 %7 to i64 %9 = getelementptr i8, ptr addrspace(4) %1, i64 %8 - %10 = load <4 x i32>, ptr addrspace(4) %9, align 16, !invariant.load !16 - %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, <4 x i32> %10, i32 0) - call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, <4 x i32> %6, i32 1) + %11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 8, ptr addrspace(4) %9, i32 0) + call void (...) @lgc.create.image.store(<4 x float> %11, i32 0, i32 8, ptr addrspace(4) %5, i32 1) br label %ret ret: ; preds = %bb, %entry @@ -111,3 +109,8 @@ attributes #3 = { nounwind memory(write) } !14 = !{i32 0} !15 = !{i32 1} !16 = !{} +;. +; CHECK: [[META14]] = !{i32 0} +; CHECK: [[META15]] = !{i32 1} +; CHECK: [[META16]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc index cf57f85ba7..42398f1ce4 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest6.lgc @@ -8,7 +8,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !22 !lgc.shaderstage !23 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !22 !lgc.shaderstage [[META23:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META22:![0-9]+]] !lgc.shaderstage [[META23:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -35,8 +35,8 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i32> [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr addrspace(4) ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META24:![0-9]+]] -; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.struct.buffer.load.format.v4i32(<4 x i32> [[TMP19]], i32 [[DOT0]], i32 0, i32 0, i32 0), !invariant.load [[META24]] +; CHECK-NEXT: [[TMP19:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP18]], align 32, !invariant.load [[META24:![0-9]+]] +; CHECK-NEXT: [[TMP20:%.*]] = call <4 x i32> @llvm.amdgcn.image.load.1d.v4i32.i32(i32 15, i32 [[DOT0]], <8 x i32> [[TMP19]], i32 0, i32 0), !invariant.load [[META24]] ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP20]], i64 0 ; CHECK-NEXT: [[TMP22:%.*]] = call i32 @lgc.load.user.data__i32(i32 36) ; CHECK-NEXT: [[TMP23:%.*]] 
= insertelement <2 x i32> [[TMP3]], i32 [[TMP22]], i64 0 @@ -46,12 +46,12 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP21]], 32 ; CHECK-NEXT: [[TMP28:%.*]] = sext i32 [[TMP27]] to i64 ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP29]], align 32, !invariant.load [[META24]] ; CHECK-NEXT: [[TMP31:%.*]] = call i32 @lgc.load.user.data__i32(i32 36) ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP31]], i64 0 ; CHECK-NEXT: [[TMP33:%.*]] = bitcast <2 x i32> [[TMP32]] to i64 ; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr addrspace(4) ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP34]], i32 0 +; CHECK-NEXT: [[TMP52:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP29]], align 32, !invariant.load [[META24]] ; CHECK-NEXT: [[TMP36:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP35]], align 16, !invariant.load [[META24]] ; CHECK-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP27]]) ; CHECK-NEXT: [[TMP38:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP37]], i32 [[TMP27]]) @@ -64,26 +64,29 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP44]] to i64 ; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP45]] ; CHECK-NEXT: [[TMP47:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP46]], align 32, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP59:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP35]], align 16, !invariant.load [[META24]] ; CHECK-NEXT: [[TMP48:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP44]]) ; CHECK-NEXT: [[TMP49:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP48]], i32 [[TMP44]]) ; CHECK-NEXT: [[TMP50:%.*]] = sext i32 
[[TMP49]] to i64 ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP50]] -; CHECK-NEXT: [[TMP52:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP51]], align 32, !invariant.load [[META24]] -; CHECK-NEXT: [[TMP53:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP52]], <4 x i32> [[TMP36]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP67:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP51]], align 32, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP53:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP67]], <4 x i32> [[TMP59]], i1 false, i32 0, i32 0) ; CHECK-NEXT: [[TMP54:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP48]], <4 x float> [[TMP53]]) +; CHECK-NEXT: [[TMP68:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP29]], align 32, !invariant.load [[META24]] +; CHECK-NEXT: [[TMP69:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP35]], align 16, !invariant.load [[META24]] ; CHECK-NEXT: [[TMP55:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[TMP27]]) ; CHECK-NEXT: [[TMP56:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP55]], i32 [[TMP27]]) ; CHECK-NEXT: [[TMP57:%.*]] = sext i32 [[TMP56]] to i64 ; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP26]], i64 [[TMP57]] -; CHECK-NEXT: [[TMP59:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP58]], align 32, !invariant.load [[META24]] -; CHECK-NEXT: [[TMP60:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP59]], <4 x i32> [[TMP36]], i1 false, i32 0, i32 0) +; CHECK-NEXT: [[TMP70:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP58]], align 32, !invariant.load [[META24]] +; 
CHECK-NEXT: [[TMP60:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP70]], <4 x i32> [[TMP69]], i1 false, i32 0, i32 0) ; CHECK-NEXT: [[TMP61:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP55]], <4 x float> [[TMP60]]) ; CHECK-NEXT: [[TMP62]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[DOT09]], [[TMP61]] ; CHECK-NEXT: [[TMP63:%.*]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[TMP43]], [[TMP54]] ; CHECK-NEXT: [[TMP64]] = fadd reassoc nnan nsz arcp contract afn <4 x float> [[DOT010]], [[TMP63]] ; CHECK-NEXT: [[TMP65]] = add i32 [[DOT0]], 1 ; CHECK-NEXT: br label [[TMP9]], !llvm.loop [[LOOP25:![0-9]+]] -; CHECK: 66: +; CHECK: 69: ; CHECK-NEXT: ret void ; .entry: @@ -105,24 +108,20 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi 7: ; preds = %3 %8 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 1, i32 12) %9 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 4, i32 4, i64 1, i32 12) - %10 = load <4 x i32>, ptr addrspace(4) %8, align 16, !invariant.load !24 - %11 = call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 0, i32 1536, <4 x i32> %10, i32 %.0) + %11 = call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 0, i32 1536, ptr addrspace(4) %8, i32 %.0) %12 = extractelement <4 x i32> %11, i64 0 %13 = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 6) %14 = call i32 (...) @lgc.create.get.desc.stride__i32(i32 1, i32 1, i64 0, i32 6) %15 = mul i32 %12, %14 %16 = sext i32 %15 to i64 %17 = getelementptr i8, ptr addrspace(4) %13, i64 %16 - %18 = load <8 x i32>, ptr addrspace(4) %17, align 32, !invariant.load !24 %19 = call ptr addrspace(4) (...) 
@lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 5) - %20 = load <4 x i32>, ptr addrspace(4) %19, align 16, !invariant.load !24 - %21 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %18, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) + %21 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %17, ptr addrspace(4) %19, i32 1, <2 x float> zeroinitializer) %22 = mul i32 %1, %14 %23 = sext i32 %22 to i64 %24 = getelementptr i8, ptr addrspace(4) %13, i64 %23 - %25 = load <8 x i32>, ptr addrspace(4) %24, align 32, !invariant.load !24 - %26 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %25, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) - %27 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %18, <4 x i32> %20, i32 1, <2 x float> zeroinitializer) + %26 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %24, ptr addrspace(4) %19, i32 1, <2 x float> zeroinitializer) + %27 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %17, ptr addrspace(4) %19, i32 1, <2 x float> zeroinitializer) %28 = fadd reassoc nnan nsz arcp contract afn <4 x float> %.09, %27 %29 = fadd reassoc nnan nsz arcp contract afn <4 x float> %21, %26 %30 = fadd reassoc nnan nsz arcp contract afn <4 x float> %.010, %29 @@ -191,3 +190,9 @@ attributes #2 = { nounwind memory(none) } !23 = !{i32 6} !24 = !{} !25 = distinct !{!25} +;. +; CHECK: [[META22]] = !{i32 4} +; CHECK: [[META23]] = !{i32 6} +; CHECK: [[META24]] = !{} +; CHECK: [[LOOP25]] = distinct !{[[LOOP25]]} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc index 445a355b8d..4b1edb4ae3 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest7.lgc @@ -10,7 +10,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -37,18 +37,15 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I8:%.*]] = mul i32 [[I]], 48 ; CHECK-NEXT: [[I9:%.*]] = sext i32 [[I8]] to i64 ; CHECK-NEXT: [[I10:%.*]] = getelementptr i8, ptr addrspace(4) [[I3]], i64 [[I9]] -; CHECK-NEXT: [[I11:%.*]] = load <4 x i32>, ptr addrspace(4) [[I10]], align 16, !invariant.load [[META10:![0-9]+]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb3: -; CHECK-NEXT: [[I12:%.*]] = load <8 x i32>, ptr addrspace(4) [[I7]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP14:%.*]] = load <8 x i32>, ptr addrspace(4) [[I7]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[I3]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I5]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I5]]) -; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = 
getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP13]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I3]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP18]], align 16, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr addrspace(4) [[I1]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP16:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP18]], align 32, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP20:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[TMP16]], <4 x i32> [[TMP19]], i1 false, i32 0, i32 0) ; CHECK-NEXT: [[I13:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.waterfall.end.v4f32(i32 [[TMP12]], <4 x float> [[TMP20]]) ; CHECK-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[I13]]) #[[ATTR5:[0-9]+]] @@ -72,12 +69,10 @@ bb2: ; preds = %bb1 %i8 = mul i32 %i, %i4 %i9 = sext i32 %i8 to i64 %i10 = getelementptr i8, ptr addrspace(4) %i3, i64 %i9 - %i11 = load <4 x i32>, ptr addrspace(4) %i10, align 16, !invariant.load !10 br label %bb3 bb3: ; preds = %bb2 - %i12 = load <8 x i32>, ptr addrspace(4) %i7, align 32, !invariant.load !10 - %i13 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i12, <4 x i32> %i11, i32 1, <2 x float> zeroinitializer) + %i13 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %i7, ptr addrspace(4) %i3, i32 1, <2 x float> zeroinitializer) call void (...) 
@lgc.create.write.generic.output(<4 x float> %i13, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) ret void } @@ -127,3 +122,8 @@ attributes #3 = { nounwind } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc index 03b16464d2..4958e7432e 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest8.lgc @@ -10,7 +10,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -41,8 +41,8 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I6:%.*]] = mul i32 [[I]], [[PHI]] ; CHECK-NEXT: [[I7:%.*]] = sext i32 [[I6]] to i64 ; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] -; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] -; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10:![0-9]+]] +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 
@llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) ; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 @@ -79,9 +79,7 @@ bb3: ; preds = %bb2, %bb1 %i6 = mul i32 %i, %phi %i7 = sext i32 %i6 to i64 %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 - %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 - %i10 = load <8 x i32>, ptr addrspace(4) %i5, align 32, !invariant.load !10 - %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %i5, ptr addrspace(4) %i8, i32 1, <2 x float> zeroinitializer) call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) ret void } @@ -131,3 +129,8 @@ attributes #3 = { nounwind } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. 
diff --git a/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc b/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc index 97e7f1777c..c188e6376b 100644 --- a/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc +++ b/lgc/test/scalarizationOfDescriptorLoadsTest9.lgc @@ -9,7 +9,7 @@ target triple = "amdgcn--amdpal" ; Function Attrs: nounwind define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spirv.ExecutionModel !8 !lgc.shaderstage !9 { ; CHECK-LABEL: define dllexport spir_func void @lgc.shader.FS.main( -; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel !8 !lgc.shaderstage [[META9:![0-9]+]] { +; CHECK-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] !spirv.ExecutionModel [[META8:![0-9]+]] !lgc.shaderstage [[META9:![0-9]+]] { ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> @@ -37,6 +37,8 @@ define dllexport spir_func void @lgc.shader.FS.main() local_unnamed_addr #0 !spi ; CHECK-NEXT: [[I8:%.*]] = getelementptr i8, ptr addrspace(4) [[I2]], i64 [[I7]] ; CHECK-NEXT: [[I9:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10:![0-9]+]] ; CHECK-NEXT: [[I10:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr addrspace(4) [[I5]], align 32, !invariant.load [[META10]] +; CHECK-NEXT: [[TMP22:%.*]] = load <4 x i32>, ptr addrspace(4) [[I8]], align 16, !invariant.load [[META10]] ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.waterfall.begin.i32(i32 0, i32 [[I3]]) ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.waterfall.readfirstlane.i32.i32(i32 [[TMP12]], i32 [[I3]]) ; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 @@ -72,7 +74,7 @@ loop: ; preds = %loop, %.entry %i8 = getelementptr i8, ptr addrspace(4) %i2, i64 %i7 %i9 = load <4 x i32>, ptr addrspace(4) %i8, align 16, !invariant.load !10 %i10 = load <8 x i32>, ptr 
addrspace(4) %i5, align 32, !invariant.load !10 - %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, <8 x i32> %i10, <4 x i32> %i9, i32 1, <2 x float> zeroinitializer) + %i11 = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 24, ptr addrspace(4) %i5, ptr addrspace(4) %i8, i32 1, <2 x float> zeroinitializer) call void (...) @lgc.create.write.generic.output(<4 x float> %i11, i32 0, i32 0, i32 0, i32 0, i32 0, i32 poison) %ind = add i32 %phi.ind, 1 %cond = icmp ne i32 %ind, 1000 @@ -127,3 +129,8 @@ attributes #3 = { nounwind } !8 = !{i32 4} !9 = !{i32 6} !10 = !{} +;. +; CHECK: [[META8]] = !{i32 4} +; CHECK: [[META9]] = !{i32 6} +; CHECK: [[META10]] = !{} +;. diff --git a/lgc/test/tanh.lgc b/lgc/test/tanh.lgc new file mode 100644 index 0000000000..977bc458fb --- /dev/null +++ b/lgc/test/tanh.lgc @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --tool lgc --march amdgcn --version 4 +; RUN: lgc -mcpu=gfx1100 -filetype=asm -o - %s | FileCheck --check-prefixes=CHECK %s + +; ModuleID = 'LLPC module' +source_filename = "LLPC module" +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn--amdpal" + +define float @sample(float %x) !lgc.shaderstage !1 { +; CHECK-LABEL: sample: +; CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mul_f32_e64 v1, |v0|, -2.0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v1 +; CHECK-NEXT: v_exp_f32_e32 v1, v1 +; CHECK-NEXT: s_waitcnt_depctr 0xfff +; CHECK-NEXT: v_add_f32_e32 v1, 1.0, v1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; CHECK-NEXT: v_cmp_lt_f32_e64 s[0:1], 0x6f800000, |v1| +; CHECK-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x2f800000, s[0:1] +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v2 +; CHECK-NEXT: v_rcp_f32_e32 v1, v1 +; CHECK-NEXT: s_waitcnt_depctr 0xfff +; CHECK-NEXT: v_add_f32_e32 v1, v1, v1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_mul_f32_e32 v1, v2, v1 +; CHECK-NEXT: v_sub_f32_e32 v1, 1.0, v1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) +; CHECK-NEXT: v_bfi_b32 v0, 0x7fffffff, v1, v0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %y = call float @lgc.create.tanh.f32(float %x) + ret float %y +} + +; Function Attrs: nounwind willreturn memory(read) +declare !lgc.create.opcode !2 i32 @lgc.create.read.builtin.input.i32(...) #0 + +; Function Attrs: nounwind willreturn memory(none) +declare ptr addrspace(7) @lgc.load.buffer.desc(i64, i32, i32, i32) #1 + +; Function Attrs: nounwind memory(none) +declare !lgc.create.opcode !3 float @lgc.create.tanh.f32(...) 
#2 + +attributes #0 = { nounwind willreturn memory(read) } +attributes #1 = { nounwind willreturn memory(none) } +attributes #2 = { nounwind memory(none) } + +!llpc.compute.mode = !{!0} + +!0 = !{i32 8, i32 8, i32 1} +!1 = !{i32 7} +!2 = !{i32 77} +!3 = !{i32 17} diff --git a/llpc/CMakeLists.txt b/llpc/CMakeLists.txt index 16120f7005..277bde621a 100644 --- a/llpc/CMakeLists.txt +++ b/llpc/CMakeLists.txt @@ -205,7 +205,6 @@ if(ICD_BUILD_LLPC) context/llpcGraphicsContext.cpp context/llpcPipelineContext.cpp context/llpcRayTracingContext.cpp - context/GfxRuntimeContext.cpp ) # llpc/lower diff --git a/llpc/context/llpcCompiler.cpp b/llpc/context/llpcCompiler.cpp index 6544139aa1..bc85bd630c 100644 --- a/llpc/context/llpcCompiler.cpp +++ b/llpc/context/llpcCompiler.cpp @@ -64,6 +64,7 @@ #include "lgc/LgcCpsDialect.h" #include "lgc/LgcRtDialect.h" #include "lgc/PassManager.h" +#include "lgc/RuntimeContext.h" #include "llvm-dialects/Dialect/Dialect.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallSet.h" @@ -657,7 +658,7 @@ Result Compiler::BuildShaderModule(const ShaderModuleBuildInfo *shaderInfo, Shad std::vector imageSymbolInfo; std::vector atomicCounterSymbolInfo; std::vector defaultUniformSymbolInfo; - if (shaderInfo->options.pipelineOptions.buildResourcesDataForShaderModule && + if (shaderInfo->options.pipelineOptions.getGlState().buildResourcesDataForShaderModule && moduleData.binType == BinaryType::Spirv) { buildShaderModuleResourceUsage(shaderInfo, resourceNodes, inputSymbolInfo, outputSymbolInfo, uniformBufferInfo, storageBufferInfo, textureSymbolInfo, imageSymbolInfo, atomicCounterSymbolInfo, @@ -700,7 +701,7 @@ Result Compiler::BuildShaderModule(const ShaderModuleBuildInfo *shaderInfo, Shad pShaderModuleData->binCode.pCode = bufferWritePtr; bufferWritePtr += codeBuffer.size() * sizeof(unsigned); - if (shaderInfo->options.pipelineOptions.buildResourcesDataForShaderModule && + if 
(shaderInfo->options.pipelineOptions.getGlState().buildResourcesDataForShaderModule && moduleData.binType == BinaryType::Spirv) { memcpy(bufferWritePtr, &resourceNodes, sizeof(ResourcesNodes)); pResourcesNodes = reinterpret_cast(bufferWritePtr); @@ -763,6 +764,7 @@ static bool getSymbolInfoFromSpvVariable(const SPIRVVariable *spvVar, ResourceNo SPIRVWord varId = 0; BasicType basicType = BasicType::Unknown; symbolInfo->columnCount = 1; + symbolInfo->componentCount = 1; SPIRVWord builtIn = false; bool isBuiltIn = spvVar->hasDecorate(DecorationBuiltIn, 0, &builtIn); @@ -784,6 +786,8 @@ static bool getSymbolInfoFromSpvVariable(const SPIRVVariable *spvVar, ResourceNo } if (varElemTy->getOpCode() == OpTypeMatrix) { symbolInfo->columnCount = varElemTy->getMatrixColumnCount(); + if (varElemTy->getMatrixColumnType()->getOpCode() == OpTypeVector) + symbolInfo->componentCount = varElemTy->getMatrixColumnType()->getVectorComponentCount(); varElemTy = varElemTy->getMatrixColumnType(); } if (varElemTy->getOpCode() == OpTypeVector) @@ -3378,37 +3382,39 @@ void Compiler::adjustRayTracingElf(ElfPackage *pipelineElf, RayTracingContext *r auto &shaderFunctionSection = pipeline.getMap(true)[PalAbi::PipelineMetadataKey::ShaderFunctions].getMap(true); // Get the shader function - auto shaderFunctionName = shaderFunctionSection.begin()->first.getString(); - auto &shaderFunction = shaderFunctionSection.begin()->second.getMap(true); - - // 1. Add raytracing pipeline indirect pipeline metadata - // The metadata is needed for RGP to correctly show different subtype of shaders. 
- // Determine the shader subtype by name - auto subtype = "Unknown"; - if (auto shaderStage = tryGetLgcRtShaderStageFromName(shaderFunctionName)) { - auto stage = shaderStage.value(); - if (stage == lgc::rt::RayTracingShaderStage::RayGeneration) - subtype = "RayGeneration"; - else if (stage == lgc::rt::RayTracingShaderStage::Miss) - subtype = "Miss"; - else if (stage == lgc::rt::RayTracingShaderStage::AnyHit) - subtype = "AnyHit"; - else if (stage == lgc::rt::RayTracingShaderStage::ClosestHit) - subtype = "ClosestHit"; - else if (stage == lgc::rt::RayTracingShaderStage::Intersection) - subtype = "Intersection"; - else if (stage == lgc::rt::RayTracingShaderStage::Callable) - subtype = "Callable"; - else if (stage == lgc::rt::RayTracingShaderStage::Traversal) - subtype = "Traversal"; - } - shaderFunction[".shader_subtype"] = subtype; - - // 2. Apply the .internal_pipeline_hash to .api_shader_hash in .shader_functions section - // NOTE: this is needed for RGP to recognize different shader subtype - auto pipelineHash = pipeline.getMap(true)[PalAbi::PipelineMetadataKey::InternalPipelineHash].getArray(true); - shaderFunction[PalAbi::ShaderMetadataKey::ApiShaderHash].getArray(true)[0] = pipelineHash[0]; - shaderFunction[PalAbi::ShaderMetadataKey::ApiShaderHash].getArray(true)[1] = pipelineHash[1]; + for (auto &funcSection : shaderFunctionSection) { + auto shaderFunctionName = funcSection.first.getString(); + auto &shaderFunction = funcSection.second.getMap(true); + + // 1. Add raytracing pipeline indirect pipeline metadata + // The metadata is needed for RGP to correctly show different subtype of shaders. 
+ // Determine the shader subtype by name + auto subtype = "Unknown"; + if (auto shaderStage = tryGetLgcRtShaderStageFromName(shaderFunctionName)) { + auto stage = shaderStage.value(); + if (stage == lgc::rt::RayTracingShaderStage::RayGeneration) + subtype = "RayGeneration"; + else if (stage == lgc::rt::RayTracingShaderStage::Miss) + subtype = "Miss"; + else if (stage == lgc::rt::RayTracingShaderStage::AnyHit) + subtype = "AnyHit"; + else if (stage == lgc::rt::RayTracingShaderStage::ClosestHit) + subtype = "ClosestHit"; + else if (stage == lgc::rt::RayTracingShaderStage::Intersection) + subtype = "Intersection"; + else if (stage == lgc::rt::RayTracingShaderStage::Callable) + subtype = "Callable"; + else if (stage == lgc::rt::RayTracingShaderStage::Traversal) + subtype = "Traversal"; + } + shaderFunction[".shader_subtype"] = subtype; + + // 2. Apply the .internal_pipeline_hash to .api_shader_hash in .shader_functions section + // NOTE: this is needed for RGP to recognize different shader subtype + auto pipelineHash = pipeline.getMap(true)[PalAbi::PipelineMetadataKey::InternalPipelineHash].getArray(true); + shaderFunction[PalAbi::ShaderMetadataKey::ApiShaderHash].getArray(true)[0] = pipelineHash[0]; + shaderFunction[PalAbi::ShaderMetadataKey::ApiShaderHash].getArray(true)[1] = pipelineHash[1]; + } // Write modified metadata to the pipeline ELF ElfNote newMetaNote = metaNote; @@ -3590,6 +3596,7 @@ void Compiler::buildShaderCacheHash(Context *context, unsigned stageMask, ArrayR auto pipelineInfo = reinterpret_cast(context->getPipelineBuildInfo()); auto pipelineOptions = pipelineContext->getPipelineOptions(); + ShaderStage preStage = ShaderStageInvalid; // Build hash per shader stage for (ShaderStage stage : gfxShaderStages()) { if ((stageMask & getLgcShaderStageMask(stage)) == 0) @@ -3619,10 +3626,21 @@ void Compiler::buildShaderCacheHash(Context *context, unsigned stageMask, ArrayR // Add per stage hash code to fragmentHasher or nonFragmentHasher per shader stage auto 
shaderHashCode = MetroHash::compact64(&hash); - if (stage == ShaderStageFragment) + if (stage == ShaderStageFragment) { fragmentHasher.Update(shaderHashCode); - else + const ShaderModuleData *moduleData = reinterpret_cast(shaderInfo->pModuleData); + if (moduleData && moduleData->usage.useBarycentric) { + // If fragment uses barycentrics, we still need to care about the previous stage, because the primitive type + // might be specified there. + if ((preStage != ShaderStageInvalid) && (preStage != ShaderStageVertex)) { + auto preShaderInfo = pipelineContext->getPipelineShaderInfo(preStage); + moduleData = reinterpret_cast(preShaderInfo->pModuleData); + fragmentHasher.Update(moduleData->cacheHash); + } + } + } else nonFragmentHasher.Update(shaderHashCode); + preStage = stage; } // Add additional pipeline state to final hasher diff --git a/llpc/context/llpcContext.cpp b/llpc/context/llpcContext.cpp index 58b1d25c6b..b148ed4ef1 100644 --- a/llpc/context/llpcContext.cpp +++ b/llpc/context/llpcContext.cpp @@ -29,7 +29,6 @@ *********************************************************************************************************************** */ #include "llpcContext.h" -#include "GfxRuntimeContext.h" #include "LowerAdvancedBlend.h" #include "ProcessGfxRuntimeLibrary.h" #include "SPIRVInternal.h" @@ -55,6 +54,7 @@ #include "lgc/LgcDialect.h" #include "lgc/LgcRtDialect.h" #include "lgc/PassManager.h" +#include "lgc/RuntimeContext.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/Bitstream/BitstreamReader.h" @@ -71,6 +71,10 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/ADCE.h" +#include "llvm/Transforms/Scalar/InstSimplifyPass.h" +#include "llvm/Transforms/Scalar/SROA.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" #include "llvm/Transforms/Utils/Cloning.h" #define DEBUG_TYPE "llpc-context" @@ -241,6 +245,8 @@ void 
Context::ensureGpurtLibrary() { ShaderModuleData moduleData = {}; moduleData.binCode = rtState->gpurtShaderLibrary; + if (moduleData.binCode.codeSize == 0) + report_fatal_error("No GPURT library available"); moduleData.binType = BinaryType::Spirv; moduleData.usage.keepUnusedFunctions = true; moduleData.usage.rayQueryLibrary = true; @@ -277,6 +283,18 @@ void Context::ensureGpurtLibrary() { lowerPassMgr->addPass(AlwaysInlinerPass()); lowerPassMgr->addPass(SpirvLowerAccessChain()); lowerPassMgr->addPass(SpirvLowerGlobal()); + + // Run some basic optimization to simplify the code. This should be more efficient than optimizing them after they are + // inlined into the caller. + FunctionPassManager fpm; + fpm.addPass(SROAPass(SROAOptions::ModifyCFG)); + fpm.addPass(InstSimplifyPass()); + fpm.addPass(SimplifyCFGPass()); + // DCE is particularly useful for removing dead instructions after continuation call, which may help reducing + // continuation stack size. + fpm.addPass(ADCEPass()); + lowerPassMgr->addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); + timerProfiler.addTimerStartStopPass(*lowerPassMgr, TimerTranslate, false); lowerPassMgr->run(*gpurt); diff --git a/llpc/context/llpcPipelineContext.cpp b/llpc/context/llpcPipelineContext.cpp index 808cf24cdd..5338cffcd0 100644 --- a/llpc/context/llpcPipelineContext.cpp +++ b/llpc/context/llpcPipelineContext.cpp @@ -311,10 +311,10 @@ Options PipelineContext::computePipelineOptions() const { // Driver report full subgroup lanes for compute shader, here we just set fullSubgroups as default options options.fullSubgroups = true; options.internalRtShaders = getPipelineOptions()->internalRtShaders; - options.disableSampleMask = getPipelineOptions()->disableSampleMask; - options.disableTruncCoordForGather = getPipelineOptions()->disableTruncCoordForGather; + options.disableSampleMask = getPipelineOptions()->getGlState().disableSampleMask; + options.disableTruncCoordForGather = 
getPipelineOptions()->getGlState().disableTruncCoordForGather; options.enablePrimGeneratedQuery = getPipelineOptions()->enablePrimGeneratedQuery; - options.enableFragColor = getPipelineOptions()->enableFragColor; + options.enableFragColor = getPipelineOptions()->getGlState().enableFragColor; options.rtBoxSortHeuristicMode = m_rtState.boxSortHeuristicMode; options.rtStaticPipelineFlags = m_rtState.staticPipelineFlags; @@ -481,7 +481,7 @@ void PipelineContext::convertResourceNode(ResourceNode &dst, const ResourceMappi else dst.concreteType = static_cast(src.type); - if (getPipelineOptions()->replaceSetWithResourceType && src.srdRange.set == 0) { + if (getPipelineOptions()->getGlState().replaceSetWithResourceType && src.srdRange.set == 0) { // Special value InternalDescriptorSetId(-1) will be passed in for internal usage dst.set = getGlResourceNodeSetFromType(src.type); } else { diff --git a/llpc/docs/DdnBindlessTexture.md b/llpc/docs/DdnBindlessTexture.md index da974d90bf..30f506efde 100644 --- a/llpc/docs/DdnBindlessTexture.md +++ b/llpc/docs/DdnBindlessTexture.md @@ -250,7 +250,7 @@ If a bindless texture is declared as uvec2, it behaves identically to a normal The ARB_bindless_texture extension was published in 2013, when we implemented this extension in OGLP driver there was no SPIR-V opcode or extension support it, so we had to add two flags to indicate whether the bindless texture/image are used in the program, we can get this state from glslang, when one texture/image in a shader is declared as bindless, all the textures/images in the given program will be handled as bindless mode, which can simplify our driver’s implementation, so in LLPC’s implementation we will continue to follow this way. 
-Two pipeline options are added to indicate whether the bindless texture or image is used, these flags are set at program link-time, so that when Llpc::Compiler::buildShaderModuleResourceUsage() is called, the texture variables can be recognized as its real type variables (eg. if declared as `layout(bindless_sampler) uniform sampler2D s1;`, it will be recognized as a 64bit uint typed default uniform variable, instead of a texture), so that we can create the correct resourceMappingNode table for each kind of resource. And these two flags will also be checked at pipeline compile-time, so that we can generate the correct LLVM IR for bindless texture. +Two pipeline options are added to indicate whether the bindless texture or image is used, these two flags will be checked at pipeline compile-time, so that we can generate the correct LLVM IR for bindless texture. ``` c++ struct PipelineOptions { @@ -314,7 +314,7 @@ If declare a bindless texture handle as samplerXX type, it will be a `OpTypeSamp - At program link-time, when calling `Llpc::Compiler::buildShaderModuleResourceUsage()`, we need to recognize `OpTypeSampledImage` type variable as a 64-bit unsigned integer typed default uniform, so that we will not generate resource mapping node for texture, but generate a default uniform instead; - At pipeline compile time, we only need to add two patches in spirvReader: - 1). When calls `SPIRVToLLVM::transVariable()` to translate variable `%13`, we need to force to change the variable type from `OpTypedSapledImage` to int64, so that we can generate a uniform variable’s declaration, and we can handle `OpLoad` instruction correctly; + 1). 
When calling `SPIRVToLLVM::transVariable()` to translate variable `%13`, we need to forcibly change the variable type from `OpTypedSampledImage` to int64, so that we can generate a uniform variable’s declaration; ``` %11 = OpTypeSampledImage %10 @@ -329,19 +329,16 @@ If declare a bindless texture handle as samplerXX type, it will be a `OpTypeSamp %18 = OpLoad %15 %17 %19 = OpImageSampleImplicitLod %7 %14 %18 ``` - 2). When calling `SPIRVToLLVM::transValueWithOpcode()` to load the bindless texture handle, we need to do two things: - i). Load 64-bit image descriptor address, then convert it to an int pointer with correct address space; + 2). When calling `SPIRVToLLVM::transValueWithOpcode()` to load the bindless texture handle, we need to load the imageDescPointer by the bindless handle; +The above solution works for the simple cases, but in the real implementation, we found if the texture is declared as an array, multi-dimensional array, or declared as a struct member or block member, it is hard to handle the accessChain instruction, especially when translating the type of a bindless texture to a 64-bit unsigned integer. To handle the aggregate data types, we provided a new solution in Spirv-Builder: +1). Convert the OpTypeSampledImage typed variable to a uvec2 type variable; +2). Before the texture function is called, insert a bitcast opCode to convert the uvec2 type handle to a sampler type variable;
- -![](./DdnBindlessTexturePipelineDumpDeclSamplerType.PNG) +The above solution can significantly simplify the implementation in LLPC, after this change, we don't need to convert the data types at program link-time, we don't need to change the variable's type when calling SPIRVToLLVM::transVariable(), and we don't need to do any change to handle the accessChain instructions for the aggregate types, the bindless handle will be treated just as a uvec2 type variable, and handling the case that declares a bindless texture by a samplerXX type variable would be exactly the same as one that declares a bindless texture by a uvec2 type. #### 2. Declare bindless texture handle as uvec2 type -If declare a bindless texture as uniform uvec2 type, the solution would be much easier, we don’t need to change the variable’s data type at program link-time or when `SPIRVToLLVM::transVariable()` is called, an `OpBitcast` instruction was added by SPIR-V builder to convert a 64-bit handle to a sampler, which need to handle specially for bindless texture. As the bindless handle is a native 64-bit data type, so the result of this instruction `%14 = OpLoad %11 %13` is a 64-bit texture handle, when translate the following instruction -`%17 = OpBitcast %16 %14`, we need to do the same thing as above case(declared the handle by sampler2D): - +If declare a bindless texture as uniform uvec2 type, the solution would be much easier, an `OpBitcast` instruction was added by SPIR-V builder to convert a 64-bit handle to a sampler, which needs to be handled specially for bindless texture.
As the bindless handle is a native 64-bit data type, so the result of this instruction `%14 = OpLoad %11 %13` is a 64-bit texture handle, when translate the following instruction +`%17 = OpBitcast %16 %14` - Load 64-bit image descriptor address, then convert it to an int pointer with correct address space; - Obtain the each descriptor’s pointer after image descriptor address is loaded, then insert all descriptors in the structure; diff --git a/llpc/docs/DdnBindlessTexturePipelineDumpDeclSamplerType.PNG b/llpc/docs/DdnBindlessTexturePipelineDumpDeclSamplerType.PNG deleted file mode 100644 index eeb1a2a4b5..0000000000 Binary files a/llpc/docs/DdnBindlessTexturePipelineDumpDeclSamplerType.PNG and /dev/null differ diff --git a/llpc/docs/DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG b/llpc/docs/DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG index 5c2eef2925..e3c868e0a3 100644 Binary files a/llpc/docs/DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG and b/llpc/docs/DdnBindlessTexturePipelineDumpDeclUvec2Type.PNG differ diff --git a/llpc/lower/LowerAdvancedBlend.cpp b/llpc/lower/LowerAdvancedBlend.cpp index e06f675fe1..6e937329dc 100644 --- a/llpc/lower/LowerAdvancedBlend.cpp +++ b/llpc/lower/LowerAdvancedBlend.cpp @@ -29,13 +29,13 @@ *********************************************************************************************************************** */ #include "LowerAdvancedBlend.h" -#include "GfxRuntimeContext.h" #include "SPIRVInternal.h" #include "compilerutils/CompilerUtils.h" #include "llpcContext.h" #include "llpcSpirvLowerInternalLibraryIntrinsicUtil.h" #include "vkgcDefs.h" #include "lgc/Builder.h" +#include "lgc/RuntimeContext.h" #define DEBUG_TYPE "Lower-advanced-blend" @@ -85,30 +85,23 @@ void LowerAdvancedBlend::processFsOutputs(Module &module) { if (global.getType()->getAddressSpace() == SPIRAS_Uniform && global.getName().ends_with(AdvancedBlendIsMsaaName)) isMsaaUniform = &global; } - // Prepare arguments of AmdAdvancedBlend(inColor, imageDescMsLow, 
imageDescMsHigh, imageDescLow, imageDescHigh, - // fmaskDescLow, fmaskDescHigh, mode, isMsaa) from shaderLibrary + // Prepare arguments of AmdAdvancedBlend(inColor, imageDescMs, imageDesc, fmaskDesc, mode, isMsaa) from shaderLibrary m_builder->SetInsertPointPastAllocas(m_entryPoint); // Get the parameters and store them into the allocated parameter points - Type *descType = FixedVectorType::get(m_builder->getInt32Ty(), 8); unsigned bindings[2] = {m_binding, m_binding + 1}; - Value *imageDescLow[2] = {}; - Value *imageDescHigh[2] = {}; + Value *imageDesc[2] = {}; for (unsigned id = 0; id < 2; ++id) { unsigned descSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorResource); - Value *imageDescPtr = m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorResource, - ResourceNodeType::DescriptorResource, descSet, bindings[id]); - Value *imageDesc = m_builder->CreateLoad(descType, imageDescPtr); - imageDescLow[id] = m_builder->CreateShuffleVector(imageDesc, ArrayRef{0, 1, 2, 3}); - imageDescHigh[id] = m_builder->CreateShuffleVector(imageDesc, ArrayRef{4, 5, 6, 7}); + imageDesc[id] = m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorResource, + ResourceNodeType::DescriptorResource, descSet, bindings[id]); + imageDesc[id] = m_builder->CreatePtrToInt(imageDesc[id], m_builder->getInt64Ty()); } unsigned descSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorFmask); - Value *fmaskDescPtr = m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorFmask, - ResourceNodeType::DescriptorFmask, descSet, m_binding); - Value *fmaskDesc = m_builder->CreateLoad(descType, fmaskDescPtr); - Value *fmaskDescLow = m_builder->CreateShuffleVector(fmaskDesc, ArrayRef{0, 1, 2, 3}); - Value *fmaskDescHigh = m_builder->CreateShuffleVector(fmaskDesc, ArrayRef{4, 5, 6, 7}); + Value *fmaskDesc = m_builder->CreateGetDescPtr(ResourceNodeType::DescriptorFmask, ResourceNodeType::DescriptorFmask, + descSet, 
m_binding); + fmaskDesc = m_builder->CreatePtrToInt(fmaskDesc, m_builder->getInt64Ty()); assert(modeUniform && isMsaaUniform); modeUniform = m_builder->CreateLoad(m_builder->getInt32Ty(), modeUniform); @@ -132,8 +125,7 @@ void LowerAdvancedBlend::processFsOutputs(Module &module) { Value *blendColor = inliner .inlineCall(*m_builder, advancedBlendFunc, - {srcVal, imageDescLow[0], imageDescHigh[0], imageDescLow[1], imageDescHigh[1], - fmaskDescLow, fmaskDescHigh, modeUniform, isMsaaUniform}) + {srcVal, imageDesc[0], imageDesc[1], fmaskDesc, modeUniform, isMsaaUniform}) .returnValue; storeInst->setOperand(0, blendColor); diff --git a/llpc/lower/LowerGLCompatibility.cpp b/llpc/lower/LowerGLCompatibility.cpp index 62c82a0697..aeae5b78fe 100644 --- a/llpc/lower/LowerGLCompatibility.cpp +++ b/llpc/lower/LowerGLCompatibility.cpp @@ -773,19 +773,16 @@ void LowerGLCompatibility::emulateDrawPixels() { auto vec2Type = FixedVectorType::get(floatType, 2); auto vec4Type = FixedVectorType::get(floatType, 4); auto ivec2Type = FixedVectorType::get(int32Type, 2); - auto ivec8Type = FixedVectorType::get(int32Type, 8); if (m_patchTexCoord == nullptr) { createPatchTexCoord(); } Value *patchTexcoord = m_builder->CreateLoad(vec2Type, m_patchTexCoord); Value *texcoord = m_builder->CreateFPToUI(patchTexcoord, ivec2Type); - auto imageDesc = m_builder->CreateGetDescPtr( + auto imageDescPtr = m_builder->CreateGetDescPtr( lgc::ResourceNodeType::DescriptorResource, lgc::ResourceNodeType::DescriptorResource, PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorResource), Vkgc::InternalBinding::PixelOpInternalBinding); - auto descriptor = m_builder->CreateLoad(ivec8Type, imageDesc); - descriptor->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); - Value *texel = m_builder->CreateImageLoad(vec4Type, Dim2D, 0, descriptor, texcoord, nullptr); + Value *texel = m_builder->CreateImageLoad(vec4Type, Dim2D, 0, imageDescPtr, texcoord, nullptr); // 
Write Color if (buildInfo->glState.drawPixelsType == Vkgc::DrawPixelsTypeColor) { @@ -868,7 +865,6 @@ void LowerGLCompatibility::emulateBitmap() { auto int32Type = m_builder->getInt32Ty(); auto vec2Type = FixedVectorType::get(floatType, 2); auto ivec2Type = FixedVectorType::get(int32Type, 2); - auto ivec8Type = FixedVectorType::get(int32Type, 8); if (!m_patchTexCoord) { createPatchTexCoord(); } @@ -882,13 +878,11 @@ void LowerGLCompatibility::emulateBitmap() { } mask = m_builder->CreateShl(ConstantInt::get(ivec2Type, 1), mask); Value *texCoordSrc = m_builder->CreateLShr(constInt0x3, texcoord); - auto imageDesc = m_builder->CreateGetDescPtr( + auto imageDescPtr = m_builder->CreateGetDescPtr( lgc::ResourceNodeType::DescriptorResource, lgc::ResourceNodeType::DescriptorResource, PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorResource), Vkgc::InternalBinding::PixelOpInternalBinding); - auto descriptor = m_builder->CreateLoad(ivec8Type, imageDesc); - descriptor->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); - Value *texel = m_builder->CreateImageLoad(ivec2Type, Dim2D, 0, descriptor, texCoordSrc, nullptr); + Value *texel = m_builder->CreateImageLoad(ivec2Type, Dim2D, 0, imageDescPtr, texCoordSrc, nullptr); Value *val = m_builder->CreateAnd(mask, texel); val = m_builder->CreateExtractElement(val, ConstantInt::get(int32Type, 0)); auto cmp = m_builder->CreateICmpEQ(val, ConstantInt::get(int32Type, 0)); diff --git a/llpc/lower/ProcessGfxRuntimeLibrary.cpp b/llpc/lower/ProcessGfxRuntimeLibrary.cpp index 378a8e749e..3dac36e0a2 100644 --- a/llpc/lower/ProcessGfxRuntimeLibrary.cpp +++ b/llpc/lower/ProcessGfxRuntimeLibrary.cpp @@ -113,37 +113,35 @@ void ProcessGfxRuntimeLibrary::processLibraryFunction(Function *&func) { // ===================================================================================================================== // Create texel load void ProcessGfxRuntimeLibrary::createTexelLoad(Function 
*func) { - // Arguments: imageDescLow, imageDescHigh, icoord, lod - constexpr unsigned argCount = 4; - Type *int4Ty = FixedVectorType::get(m_builder->getInt32Ty(), 4); + // Arguments: imageDesc, icoord, lod + constexpr unsigned argCount = 3; Type *int2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); - Type *argTypes[] = {int4Ty, int4Ty, int2Ty, m_builder->getInt32Ty()}; + Type *argTypes[] = {m_builder->getInt64Ty(), int2Ty, m_builder->getInt32Ty()}; std::array loadArgs; for (unsigned i = 0; i < argCount; ++i) loadArgs[i] = m_builder->CreateLoad(argTypes[i], func->getArg(i)); unsigned imageFlag = Builder::ImageFlagInvariant | Builder::ImageFlagNotAliased; - auto imageDesc = m_builder->CreateShuffleVector(loadArgs[0], loadArgs[1], ArrayRef{0, 1, 2, 3, 4, 5, 6, 7}); - auto imageLoad = - m_builder->CreateImageLoad(func->getReturnType(), Builder::Dim2D, imageFlag, imageDesc, loadArgs[2], loadArgs[3]); + loadArgs[0] = m_builder->CreateIntToPtr(loadArgs[0], PointerType::get(m_builder->getContext(), ADDR_SPACE_CONST)); + auto imageLoad = m_builder->CreateImageLoad(func->getReturnType(), Builder::Dim2D, imageFlag, loadArgs[0], + loadArgs[1], loadArgs[2]); m_builder->CreateRet(imageLoad); } // ===================================================================================================================== // Create texel load with fmask void ProcessGfxRuntimeLibrary::createTexelLoadFmask(Function *func) { - // Argument: imageDescLow, imageDescHigh, fmaskDescLow, fmaskDescHigh, icoord, lod - constexpr unsigned argCount = 6; - Type *int4Ty = FixedVectorType::get(m_builder->getInt32Ty(), 4); + // Argument: imageDescMs, fmaskDesc, icoord, lod + constexpr unsigned argCount = 4; Type *int2Ty = FixedVectorType::get(m_builder->getInt32Ty(), 2); - Type *argTypes[] = {int4Ty, int4Ty, int4Ty, int4Ty, int2Ty, m_builder->getInt32Ty()}; + Type *argTypes[] = {m_builder->getInt64Ty(), m_builder->getInt64Ty(), int2Ty, m_builder->getInt32Ty()}; std::array loadArgs; for (unsigned i = 
0; i < argCount; ++i) loadArgs[i] = m_builder->CreateLoad(argTypes[i], func->getArg(i)); unsigned imageFlag = Builder::ImageFlagInvariant | Builder::ImageFlagNotAliased; - auto imageDesc = m_builder->CreateShuffleVector(loadArgs[0], loadArgs[1], ArrayRef{0, 1, 2, 3, 4, 5, 6, 7}); - auto fmaskDesc = m_builder->CreateShuffleVector(loadArgs[2], loadArgs[3], ArrayRef{0, 1, 2, 3, 4, 5, 6, 7}); - auto imageLoad = m_builder->CreateImageLoadWithFmask(func->getReturnType(), Builder::Dim2DMsaa, imageFlag, imageDesc, - fmaskDesc, loadArgs[4], loadArgs[5]); + loadArgs[0] = m_builder->CreateIntToPtr(loadArgs[0], PointerType::get(m_builder->getContext(), ADDR_SPACE_CONST)); + loadArgs[1] = m_builder->CreateIntToPtr(loadArgs[1], PointerType::get(m_builder->getContext(), ADDR_SPACE_CONST)); + auto imageLoad = m_builder->CreateImageLoadWithFmask(func->getReturnType(), Builder::Dim2DMsaa, imageFlag, + loadArgs[0], loadArgs[1], loadArgs[2], loadArgs[3]); m_builder->CreateRet(imageLoad); } diff --git a/llpc/lower/llpcSpirvLowerGlobal.cpp b/llpc/lower/llpcSpirvLowerGlobal.cpp index ef982d0525..bc3233c22d 100644 --- a/llpc/lower/llpcSpirvLowerGlobal.cpp +++ b/llpc/lower/llpcSpirvLowerGlobal.cpp @@ -30,6 +30,7 @@ */ #include "llpcSpirvLowerGlobal.h" #include "SPIRVInternal.h" +#include "compilerutils/CompilerUtils.h" #include "continuations/ContinuationsUtil.h" #include "llpcContext.h" #include "llpcDebug.h" @@ -188,8 +189,7 @@ static_assert(lgc::ShadingRateHorizontal4Pixels == "Shading rate flag mismatch"); // ===================================================================================================================== -SpirvLowerGlobal::SpirvLowerGlobal() - : m_lowerInputInPlace(false), m_lowerOutputInPlace(false), m_lastVertexProcessingStage(ShaderStageInvalid) { +SpirvLowerGlobal::SpirvLowerGlobal() : m_lastVertexProcessingStage(ShaderStageInvalid) { } // ===================================================================================================================== @@ 
-204,8 +204,39 @@ PreservedAnalyses SpirvLowerGlobal::run(Module &module, ModuleAnalysisManager &a changeRtFunctionSignature(); + // Special handling of explicit interpolation (InterpolateAt* instructions) in fragment shaders -- get those out of + // the way. + if (m_shaderStage == ShaderStageFragment) + handleCallInst(false, true); + + // Preparations for output lowering + m_unifiedReturn = nullptr; + + if (m_shaderStage == ShaderStageGeometry) { + // Collect "emit" calls + handleCallInst(true, false); + } else if (m_shaderStage < ShaderStageGfxCount) { + ensureUnifiedReturn(); + } + + // Preparations for XFB handling + auto shaderStageMask = m_context->getShaderStageMask(); + m_lastVertexProcessingStage = ShaderStageInvalid; + + if (m_shaderStage < ShaderStageFragment) { + if (shaderStageMask & ShaderStageGeometryBit) + m_lastVertexProcessingStage = ShaderStageGeometry; + else if (shaderStageMask & ShaderStageTessEvalBit) + m_lastVertexProcessingStage = ShaderStageTessEval; + else if (shaderStageMask & ShaderStageVertexBit) + m_lastVertexProcessingStage = ShaderStageVertex; + + if (m_shaderStage == m_lastVertexProcessingStage) + buildApiXfbMap(); + } + // First pass over globals - for (GlobalVariable &global : m_module->globals()) { + for (GlobalVariable &global : llvm::make_early_inc_range(m_module->globals())) { auto addrSpace = global.getType()->getAddressSpace(); if (addrSpace == SPIRAS_Private || addrSpace == SPIRAS_Input || addrSpace == SPIRAS_Output) { @@ -213,12 +244,11 @@ PreservedAnalyses SpirvLowerGlobal::run(Module &module, ModuleAnalysisManager &a // used yet for inputs/outputs.) 
convertUsersOfConstantsToInstructions(&global); - if (addrSpace == SPIRAS_Private) + if (addrSpace == SPIRAS_Private) { mapGlobalVariableToProxy(&global); - else if (addrSpace == SPIRAS_Input) - mapInputToProxy(&global); - else if (addrSpace == SPIRAS_Output) - mapOutputToProxy(&global); + } else { + lowerInOut(&global); + } } else if (addrSpace == SPIRAS_Local) { // Prefix all LDS variables to avoid downstream conflicts when linking shaders together if (global.hasName()) { @@ -227,30 +257,17 @@ PreservedAnalyses SpirvLowerGlobal::run(Module &module, ModuleAnalysisManager &a } } - // Remove global variables that were already fully replaced - for (auto globalVar : m_globalsToErase) { - globalVar->dropAllReferences(); - globalVar->eraseFromParent(); - } - m_globalsToErase.clear(); - - // Do lowering operations - if (m_lowerInputInPlace && m_lowerOutputInPlace) { - // Both input and output have to be lowered in-place (without proxy variables) - lowerInOutInPlace(); // Just one lowering operation is sufficient - } else { - // Either input or output has to be lowered in-place, not both - if (m_lowerInputInPlace) - lowerInOutInPlace(); - else - lowerInput(); - - if (m_lowerOutputInPlace) - lowerInOutInPlace(); - else - lowerOutput(); + // Now that outputs have been lowered, replace the Emit(Stream)Vertex calls with builder code. + for (auto emitCall : m_emitCalls) { + unsigned emitStreamId = + emitCall->arg_size() != 0 ? cast(emitCall->getArgOperand(0))->getZExtValue() : 0; + m_builder->SetInsertPoint(emitCall); + m_builder->CreateEmitVertex(emitStreamId); + emitCall->eraseFromParent(); } + m_emitCalls.clear(); + // Do further lowering operations if (m_shaderStage == ShaderStageVertex) lowerEdgeFlag(); @@ -294,8 +311,8 @@ void SpirvLowerGlobal::lowerEdgeFlag() { } // ===================================================================================================================== -// Handle "return" instructions. 
-ReturnInst *SpirvLowerGlobal::ensureUnifiedReturn() { +// Ensure that there is exactly one "ret" instruction. This is used for writing output variables for many shader types. +void SpirvLowerGlobal::ensureUnifiedReturn() { SmallVector retInsts; for (BasicBlock &block : *m_entryPoint) { @@ -303,8 +320,10 @@ ReturnInst *SpirvLowerGlobal::ensureUnifiedReturn() { retInsts.push_back(retInst); } - if (retInsts.size() == 1) - return retInsts[0]; + if (retInsts.size() == 1) { + m_unifiedReturn = retInsts[0]; + return; + } // There are more than 2 returns; create a unified return block. // @@ -319,7 +338,7 @@ ReturnInst *SpirvLowerGlobal::ensureUnifiedReturn() { } m_builder->SetInsertPoint(retBlock); - return m_builder->CreateRetVoid(); + m_unifiedReturn = m_builder->CreateRetVoid(); } // ===================================================================================================================== @@ -335,7 +354,7 @@ void SpirvLowerGlobal::handleCallInst(bool checkEmitCall, bool checkInterpCall) // We get all users before iterating because the iterator can be invalidated // by interpolateInputElement SmallVector users(function.users()); - for (User *user : users) { + for (User *user : make_early_inc_range(users)) { assert(isa(user) && "We should only have CallInst instructions here."); CallInst *callInst = cast(user); if (checkEmitCall) { @@ -348,6 +367,8 @@ void SpirvLowerGlobal::handleCallInst(bool checkEmitCall, bool checkInterpCall) mangledName.starts_with(gSPIRVName::InterpolateAtSample) || mangledName.starts_with(gSPIRVName::InterpolateAtOffset) || mangledName.starts_with(gSPIRVName::InterpolateAtVertexAMD)) { + m_builder->SetInsertPoint(callInst); + // Translate interpolation functions to LLPC intrinsic calls auto loadSrc = callInst->getArgOperand(0); unsigned interpLoc = InterpLocUnknown; @@ -375,7 +396,7 @@ void SpirvLowerGlobal::handleCallInst(bool checkEmitCall, bool checkInterpCall) GlobalVariable *gv = nullptr; SmallVector indexOperands; - if (auto 
getElemPtr = dyn_cast(loadSrc)) { + if (auto getElemPtr = dyn_cast(loadSrc)) { // The interpolant is an element of the input for (auto &index : getElemPtr->indices()) indexOperands.push_back(m_builder->CreateZExtOrTrunc(index, m_builder->getInt32Ty())); @@ -383,7 +404,9 @@ void SpirvLowerGlobal::handleCallInst(bool checkEmitCall, bool checkInterpCall) } else { gv = cast(loadSrc); } - interpolateInputElement(interpLoc, auxInterpValue, *callInst, gv, indexOperands); + Value *result = interpolateInputElement(callInst->getType(), interpLoc, auxInterpValue, gv, indexOperands); + callInst->replaceAllUsesWith(result); + callInst->eraseFromParent(); } } } @@ -430,162 +453,6 @@ static bool hasPrimitiveIdx(const Constant &metaVal) { return static_cast(inOutMeta.PerPrimitive); } -// ===================================================================================================================== -// Handle a single "load" instruction loading a global. -// -// @param inOut : Global Variable instruction -// @param indexOperands : Indices of GEP instruction -// @param loadInst : Load instruction -void SpirvLowerGlobal::handleLoadInstGEP(GlobalVariable *inOut, ArrayRef indexOperands, LoadInst &loadInst) { - - assert((indexOperands.empty() || cast(indexOperands.front())->isZero()) && "Non-zero GEP first index\n"); - if (!indexOperands.empty()) - indexOperands = indexOperands.drop_front(); - - m_builder->SetInsertPoint(&loadInst); - - Value *vertexIdx = nullptr; - auto inOutTy = inOut->getValueType(); - - auto addrSpace = inOut->getType()->getPointerAddressSpace(); - - MDNode *metaNode = inOut->getMetadata(gSPIRVMD::InOut); - assert(metaNode); - auto inOutMetaVal = mdconst::dyn_extract(metaNode->getOperand(0)); - - // If the input/output is arrayed, the outermost index might be used for vertex indexing - if (inOutTy->isArrayTy() && hasVertexIdx(*inOutMetaVal)) { - if (!indexOperands.empty()) { - vertexIdx = indexOperands.front(); - indexOperands = indexOperands.drop_front(); - } 
else if (inOutTy != loadInst.getType()) { - vertexIdx = m_builder->getInt32(0); - } - inOutTy = inOutTy->getArrayElementType(); - inOutMetaVal = cast(inOutMetaVal->getOperand(1)); - } - - Value *loadValue = loadInOutMember(inOutTy, loadInst.getType(), addrSpace, indexOperands, 0, inOutMetaVal, nullptr, - vertexIdx, InterpLocUnknown, nullptr, false); - - m_loadInsts.insert(&loadInst); - loadInst.replaceAllUsesWith(loadValue); -} - -// ===================================================================================================================== -// Handle "load" instructions. -void SpirvLowerGlobal::handleLoadInst() { - auto shouldHandle = [&](const unsigned addrSpace) { - if (addrSpace != SPIRAS_Input && addrSpace != SPIRAS_Output) - return false; - // Skip if "load" instructions are not expected to be handled - const bool isTcsInput = (m_shaderStage == ShaderStageTessControl && addrSpace == SPIRAS_Input); - const bool isTcsOutput = (m_shaderStage == ShaderStageTessControl && addrSpace == SPIRAS_Output); - const bool isTesInput = (m_shaderStage == ShaderStageTessEval && addrSpace == SPIRAS_Input); - const bool isMeshInput = (m_shaderStage == ShaderStageMesh && addrSpace == SPIRAS_Input); - - return isTcsInput || isTcsOutput || isTesInput || isMeshInput; - }; - - for (GlobalVariable &global : m_module->globals()) { - const unsigned addrSpace = global.getType()->getPointerAddressSpace(); - if (!shouldHandle(addrSpace)) - continue; - for (User *user : global.users()) { - if (LoadInst *loadInst = dyn_cast(user)) { - handleLoadInstGEP(&global, {}, *loadInst); - } else if (GetElementPtrInst *gep = dyn_cast(user)) { - // The user is a GEP - // We look for load instructions in the GEP users - for (User *gepUser : gep->users()) { - // We shouldn't have any chained GEPs here, they are coalesced by the LowerAccessChain pass. 
- assert(!isa(gepUser)); - if (LoadInst *loadInst = dyn_cast(gepUser)) { - SmallVector indexOperands; - for (auto &index : gep->indices()) - indexOperands.push_back(m_builder->CreateZExtOrTrunc(index, m_builder->getInt32Ty())); - handleLoadInstGEP(&global, indexOperands, *loadInst); - } - } - } - } - } -} - -// ===================================================================================================================== -// Handle a single "store" instruction storing a global. -// -// @param output : Global Variable instruction -// @param indexOperands : Indices of GEP instruction -// @param storeInst : Store instruction -void SpirvLowerGlobal::handleStoreInstGEP(GlobalVariable *output, ArrayRef indexOperands, - StoreInst &storeInst) { - assert((indexOperands.empty() || cast(indexOperands.front())->isZero()) && "Non-zero GEP first index\n"); - // drop first element - if (!indexOperands.empty()) - indexOperands = indexOperands.drop_front(); - - m_builder->SetInsertPoint(&storeInst); - - Value *storeValue = storeInst.getOperand(0); - Value *vertexOrPrimitiveIdx = nullptr; - auto outputTy = output->getValueType(); - - MDNode *metaNode = output->getMetadata(gSPIRVMD::InOut); - assert(metaNode); - auto outputMetaVal = mdconst::dyn_extract(metaNode->getOperand(0)); - // If the output is arrayed, the outermost index might be used for vertex or primitive indexing - if (outputTy->isArrayTy() && (hasVertexIdx(*outputMetaVal) || hasPrimitiveIdx(*outputMetaVal))) { - if (!indexOperands.empty()) { - vertexOrPrimitiveIdx = indexOperands.front(); - indexOperands = indexOperands.drop_front(); - } else if (outputTy != storeInst.getValueOperand()->getType()) { - vertexOrPrimitiveIdx = m_builder->getInt32(0); - } - outputTy = outputTy->getArrayElementType(); - outputMetaVal = cast(outputMetaVal->getOperand(1)); - } - - storeOutputMember(outputTy, storeInst.getValueOperand()->getType(), storeValue, indexOperands, 0, outputMetaVal, - nullptr, vertexOrPrimitiveIdx); - - 
m_storeInsts.insert(&storeInst); -} - -// ===================================================================================================================== -// Visits "store" instructions. -void SpirvLowerGlobal::handleStoreInst() { - auto shouldHandle = [&](const unsigned addrSpace) { - const bool isTcsOutput = (m_shaderStage == ShaderStageTessControl && addrSpace == SPIRAS_Output); - const bool isMeshOutput = (m_shaderStage == ShaderStageMesh && addrSpace == SPIRAS_Output); - return isTcsOutput || isMeshOutput; - }; - - for (GlobalVariable &global : m_module->globals()) { - const unsigned addrSpace = global.getType()->getPointerAddressSpace(); - if (!shouldHandle(addrSpace)) - continue; - for (User *user : global.users()) { - if (StoreInst *storeInst = dyn_cast(user)) { - handleStoreInstGEP(&global, {}, *storeInst); - } else if (GetElementPtrInst *gep = dyn_cast(user)) { - // The user is a GEP - // We look for store instructions in the GEP users - for (User *gepUser : gep->users()) { - // We shouldn't have any chained GEPs here, they are coalesced by the LowerAccessChain pass. - assert(!isa(gepUser)); - if (StoreInst *storeInst = dyn_cast(gepUser)) { - SmallVector indexOperands; - for (auto &index : gep->indices()) - indexOperands.push_back(m_builder->CreateZExtOrTrunc(index, m_builder->getInt32Ty())); - handleStoreInstGEP(&global, indexOperands, *storeInst); - } - } - } - } - } -} - // ===================================================================================================================== // Maps the specified global variable to proxy variable. // @@ -618,326 +485,158 @@ void SpirvLowerGlobal::mapGlobalVariableToProxy(GlobalVariable *globalVar) { }); } - m_globalsToErase.push_back(globalVar); -} - -// ===================================================================================================================== -// Maps the specified input to proxy variable. 
-// -// @param input : Input to be mapped -void SpirvLowerGlobal::mapInputToProxy(GlobalVariable *input) { - // NOTE: For tessellation shader, we do not map inputs to real proxy variables. Instead, we directly - // replace "load" instructions with import calls in the lowering operation. - if (m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageTessEval) { - m_inputProxyMap[input] = nullptr; - m_lowerInputInPlace = true; - return; - } - - m_builder->SetInsertPointPastAllocas(m_entryPoint); - - const auto &dataLayout = m_module->getDataLayout(); - Type *inputTy = input->getValueType(); - if (inputTy->isPointerTy()) - inputTy = m_builder->getInt64Ty(); - - MDNode *metaNode = input->getMetadata(gSPIRVMD::InOut); - assert(metaNode); - - auto meta = mdconst::dyn_extract(metaNode->getOperand(0)); - Value *proxy = m_builder->CreateAlloca(inputTy, dataLayout.getAllocaAddrSpace(), nullptr, - Twine(LlpcName::InputProxyPrefix) + input->getName()); - - // Import input to proxy variable - auto inputValue = addCallInstForInOutImport(inputTy, SPIRAS_Input, meta, nullptr, 0, nullptr, nullptr, - InterpLocUnknown, nullptr, false); - - m_builder->CreateStore(inputValue, proxy); - - m_inputProxyMap[input] = proxy; + globalVar->dropAllReferences(); + globalVar->eraseFromParent(); } // ===================================================================================================================== -// Maps the specified output to proxy variable. +// Lowers an input or output global variable. // -// @param output : Output to be mapped -void SpirvLowerGlobal::mapOutputToProxy(GlobalVariable *output) { - m_builder->SetInsertPointPastAllocas(m_entryPoint); - - // NOTE: For tessellation control shader, task shader, or mesh shader, we do not map outputs to real proxy variables. - // Instead, we directly replace "store" instructions with export calls in the lowering operation. 
- if (m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageTask || m_shaderStage == ShaderStageMesh) { - if (output->hasInitializer()) { - auto initializer = output->getInitializer(); - m_builder->CreateStore(initializer, output); - } - m_outputProxyMap.emplace_back(output, nullptr); - m_lowerOutputInPlace = true; - return; - } - - const auto &dataLayout = m_module->getDataLayout(); - Type *outputTy = output->getValueType(); - if (outputTy->isPointerTy()) - outputTy = m_builder->getInt64Ty(); - - auto proxy = m_builder->CreateAlloca(outputTy, dataLayout.getAllocaAddrSpace(), nullptr, - Twine(LlpcName::OutputProxyPrefix) + output->getName()); - - if (output->hasInitializer()) { - auto initializer = output->getInitializer(); - m_builder->CreateStore(initializer, proxy); - } - - m_outputProxyMap.emplace_back(output, proxy); -} - -// ===================================================================================================================== -// Does lowering operations for SPIR-V inputs, replaces inputs with proxy variables. -void SpirvLowerGlobal::lowerInput() { - if (m_inputProxyMap.empty()) { - // Skip lowering if there is no input - return; - } - - // NOTE: For tessellation shader, we invoke handling of "load"/"store" instructions and replace all those - // instructions with import/export calls in-place. - assert(m_shaderStage != ShaderStageTessControl && m_shaderStage != ShaderStageTessEval); - - // NOTE: For fragment shader, we have to handle interpolation functions first since input interpolants must be - // lowered in-place. 
- if (m_shaderStage == ShaderStageFragment) { - // Invoke handling of interpolation calls - handleCallInst(false, true); - - // Remove interpolation calls, they must have been replaced with LLPC intrinsics - std::unordered_set getElemInsts; - for (auto interpCall : m_interpCalls) { - GetElementPtrInst *getElemPtr = dyn_cast(interpCall->getArgOperand(0)); - if (getElemPtr) - getElemInsts.insert(getElemPtr); - - assert(interpCall->use_empty()); - interpCall->dropAllReferences(); - interpCall->eraseFromParent(); - } +// @param globalVar : the global variable to be lowered +void SpirvLowerGlobal::lowerInOut(llvm::GlobalVariable *globalVar) { + assert(globalVar->getAddressSpace() == SPIRAS_Input || globalVar->getAddressSpace() == SPIRAS_Output); + const bool isInput = globalVar->getAddressSpace() == SPIRAS_Input; - for (auto getElemPtr : getElemInsts) { - if (getElemPtr->use_empty()) { - getElemPtr->dropAllReferences(); - getElemPtr->eraseFromParent(); - } - } - } - - for (auto inputMap : m_inputProxyMap) { - auto input = cast(inputMap.first); - auto proxy = inputMap.second; - - for (auto user = input->user_begin(), end = input->user_end(); user != end; ++user) { - // NOTE: "Getelementptr" and "bitcast" will propagate the address space of pointer value (input variable) - // to the element pointer value (destination). We have to clear the address space of this element pointer - // value. The original pointer value has been lowered and therefore the address space is invalid now. 
- Instruction *inst = dyn_cast(*user); - if (inst) { - Type *instTy = inst->getType(); - if (isa(instTy) && instTy->getPointerAddressSpace() == SPIRAS_Input) { - assert(isa(inst) || isa(inst)); - Type *newInstTy = PointerType::get(*m_context, SPIRAS_Private); - inst->mutateType(newInstTy); - } - } - } - - handleVolatileInput(input, proxy); - - input->mutateType(proxy->getType()); // To clear address space for pointer to make replacement valid - input->replaceAllUsesWith(proxy); - input->eraseFromParent(); - } -} - -// ===================================================================================================================== -// Does lowering operations for SPIR-V outputs, replaces outputs with proxy variables. -void SpirvLowerGlobal::lowerOutput() { - if (m_outputProxyMap.empty() && m_shaderStage != ShaderStageGeometry) { - // Skip lowering if there is no output for non-geometry shader - return; + // Apply output initializer, if any + if (!isInput && globalVar->hasInitializer()) { + m_builder->SetInsertPointPastAllocas(m_entryPoint); + auto initializer = globalVar->getInitializer(); + m_builder->CreateStore(initializer, globalVar); } - // Collect "emit" calls - if (m_shaderStage == ShaderStageGeometry) - handleCallInst(true, false); - - // Create unified return block in which to place all the outputs from proxy variables - ReturnInst *retInst = ensureUnifiedReturn(); - - // NOTE: For tessellation control shader, we invoke handling of "load"/"store" instructions and replace all those - // instructions with import/export calls in-place. - assert(m_shaderStage != ShaderStageTessControl); + const bool mapToProxy = isInput ? 
(m_shaderStage != ShaderStageTessControl && m_shaderStage != ShaderStageTessEval) + : (m_shaderStage != ShaderStageTessControl && m_shaderStage != ShaderStageTask && + m_shaderStage != ShaderStageMesh); - // Set the last vertex processing stage - auto shaderStageMask = m_context->getShaderStageMask(); - m_lastVertexProcessingStage = ShaderStageInvalid; - if (shaderStageMask & ShaderStageGeometryBit) - m_lastVertexProcessingStage = ShaderStageGeometry; - else if (shaderStageMask & ShaderStageTessEvalBit) - m_lastVertexProcessingStage = ShaderStageTessEval; - else if (shaderStageMask & ShaderStageVertexBit) - m_lastVertexProcessingStage = ShaderStageVertex; - - buildApiXfbMap(); - - // Export output from the proxy variable prior to "return" instruction or "emit" calls - for (auto outputMap : m_outputProxyMap) { - auto output = cast(outputMap.first); - auto proxy = outputMap.second; - auto proxyTy = proxy->getAllocatedType(); - - MDNode *metaNode = output->getMetadata(gSPIRVMD::InOut); + if (mapToProxy) { + const auto &dataLayout = m_module->getDataLayout(); + Type *ty = globalVar->getValueType(); + if (ty->isPointerTy()) + ty = m_builder->getInt64Ty(); + MDNode *metaNode = globalVar->getMetadata(gSPIRVMD::InOut); assert(metaNode); - auto meta = mdconst::dyn_extract(metaNode->getOperand(0)); - if (m_shaderStage == ShaderStageVertex || m_shaderStage == ShaderStageTessEval || - m_shaderStage == ShaderStageFragment) { - m_builder->SetInsertPoint(retInst); - Value *outputValue = m_builder->CreateLoad(proxyTy, proxy); - addCallInstForOutputExport(outputValue, meta, nullptr, 0, 0, 0, nullptr, nullptr, InvalidValue); - } else if (m_shaderStage == ShaderStageGeometry) { - for (auto emitCall : m_emitCalls) { - unsigned emitStreamId = 0; - - m_builder->SetInsertPoint(emitCall); - - auto mangledName = emitCall->getCalledFunction()->getName(); - if (mangledName.starts_with(gSPIRVName::EmitStreamVertex)) - emitStreamId = cast(emitCall->getOperand(0))->getZExtValue(); - else - 
assert(mangledName.starts_with(gSPIRVName::EmitVertex)); + m_builder->SetInsertPointPastAllocas(m_entryPoint); + Value *proxy = m_builder->CreateAlloca(ty, dataLayout.getAllocaAddrSpace(), nullptr, + Twine(LlpcName::InputProxyPrefix) + globalVar->getName()); - Value *outputValue = m_builder->CreateLoad(proxyTy, proxy); - addCallInstForOutputExport(outputValue, meta, nullptr, 0, 0, 0, nullptr, nullptr, emitStreamId); - } - } - } + if (isInput) { + // Import input to proxy variable + auto inputValue = addCallInstForInOutImport(ty, SPIRAS_Input, meta, nullptr, 0, nullptr, nullptr, + InterpLocUnknown, nullptr, false); - // Replace the Emit(Stream)Vertex calls with builder code. - for (auto emitCall : m_emitCalls) { - unsigned emitStreamId = - emitCall->arg_size() != 0 ? cast(emitCall->getArgOperand(0))->getZExtValue() : 0; - m_builder->SetInsertPoint(emitCall); - m_builder->CreateEmitVertex(emitStreamId); - emitCall->eraseFromParent(); - } + m_builder->CreateStore(inputValue, proxy); - // NOTE: "Getelementptr" will propagate the address space of pointer value (output variable) - // to the element pointer value (destination). We have to clear the address space of this element pointer - // value. The original pointer value has been lowered and therefore the address space is invalid now. 
- for (auto outputMap : m_outputProxyMap) { - auto output = cast(outputMap.first); + handleVolatileInput(globalVar, proxy); + } else { + // Export the output at shader end or vertex emit + if (m_shaderStage == ShaderStageVertex || m_shaderStage == ShaderStageTessEval || + m_shaderStage == ShaderStageFragment) { + m_builder->SetInsertPoint(m_unifiedReturn); + Value *outputValue = m_builder->CreateLoad(ty, proxy); + addCallInstForOutputExport(outputValue, meta, nullptr, 0, 0, 0, nullptr, nullptr, InvalidValue); + } else { + assert(m_shaderStage == ShaderStageGeometry); - SmallVector propagationWorklist; - propagationWorklist.push_back(output); + for (auto emitCall : m_emitCalls) { + unsigned emitStreamId = 0; - while (!propagationWorklist.empty()) { - Value *current = propagationWorklist.pop_back_val(); + m_builder->SetInsertPoint(emitCall); - for (User *user : current->users()) { - Instruction *inst = dyn_cast(user); - if (inst) { - Type *instTy = inst->getType(); - if (isa(instTy) && instTy->getPointerAddressSpace() == SPIRAS_Output) { - assert(isa(inst)); - Type *newInstTy = PointerType::get(*m_context, SPIRAS_Private); - inst->mutateType(newInstTy); + auto mangledName = emitCall->getCalledFunction()->getName(); + if (mangledName.starts_with(gSPIRVName::EmitStreamVertex)) + emitStreamId = cast(emitCall->getOperand(0))->getZExtValue(); + else + assert(mangledName.starts_with(gSPIRVName::EmitVertex)); - propagationWorklist.push_back(user); - } + Value *outputValue = m_builder->CreateLoad(ty, proxy); + addCallInstForOutputExport(outputValue, meta, nullptr, 0, 0, 0, nullptr, nullptr, emitStreamId); } } } - auto proxy = outputMap.second; - output->mutateType(proxy->getType()); // To clear address space for pointer to make replacement valid - output->replaceAllUsesWith(proxy); - output->eraseFromParent(); + SmallVector toErase; + CompilerUtils::replaceAllPointerUses(m_builder, globalVar, proxy, toErase); + for (auto inst : toErase) + inst->eraseFromParent(); + } else { 
+ // In-place lowering. + SmallVector indexStack; + lowerInOutUsersInPlace(globalVar, globalVar, indexStack); } + + assert(globalVar->use_empty()); + globalVar->eraseFromParent(); } // ===================================================================================================================== -// Does inplace lowering operations for SPIR-V inputs/outputs, replaces "load" instructions with import calls and -// "store" instructions with export calls. -void SpirvLowerGlobal::lowerInOutInPlace() { - assert(m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageTessEval || - m_shaderStage == ShaderStageMesh); - - // Invoke handling of "load" and "store" instruction - handleLoadInst(); - if (m_shaderStage == ShaderStageTessControl || m_shaderStage == ShaderStageMesh) - handleStoreInst(); - - DenseSet getElemInsts; - - // Remove unnecessary "load" instructions - for (auto loadInst : m_loadInsts) { - GetElementPtrInst *const getElemPtr = dyn_cast(loadInst->getPointerOperand()); - if (getElemPtr) - getElemInsts.insert(getElemPtr); - - assert(loadInst->use_empty()); - loadInst->dropAllReferences(); - loadInst->eraseFromParent(); - } - - m_loadInsts.clear(); - - // Remove unnecessary "store" instructions - for (auto storeInst : m_storeInsts) { - GetElementPtrInst *const getElemPtr = dyn_cast(storeInst->getPointerOperand()); - if (getElemPtr) - getElemInsts.insert(getElemPtr); - - assert(storeInst->use_empty()); - storeInst->dropAllReferences(); - storeInst->eraseFromParent(); - } - - m_storeInsts.clear(); - - // Remove unnecessary "getelementptr" instructions - while (!getElemInsts.empty()) { - GetElementPtrInst *const getElemPtr = *getElemInsts.begin(); - getElemInsts.erase(getElemPtr); - - // If the GEP still has any uses, skip processing it. - if (!getElemPtr->use_empty()) - continue; - - // If the GEP is GEPing into another GEP, record that GEP as something we need to visit too. 
- if (GetElementPtrInst *const otherGetElemInst = dyn_cast(getElemPtr->getPointerOperand())) - getElemInsts.insert(otherGetElemInst); - - getElemPtr->dropAllReferences(); - getElemPtr->eraseFromParent(); - } +// Recursively lower all users of `current`, which can be traced back to `globalVar` via the given GEP indices, +// to in-place import/export ops. +// +// This makes the assumption that GEPs have not been type-punned (though 0 indices may have been dropped). +void SpirvLowerGlobal::lowerInOutUsersInPlace(llvm::GlobalVariable *globalVar, llvm::Value *current, + SmallVectorImpl &indexStack) { + for (User *user : llvm::make_early_inc_range(current->users())) { + Instruction *inst = cast(user); + + if (auto *gep = dyn_cast(inst)) { + // We currently expect that GEPs are only used on the global variable directly, with the global variable's type. + // The SpirvLowerAccessChain pass ensures this. + // + // TODO: As LLVM is moving away from GEPs towards ptradds, we need a better solution, probably by adding our + // own "structured GEP" operation. + assert(current == globalVar && gep->getSourceElementType() == globalVar->getValueType()); + assert(cast(gep->idx_begin()[0])->isNullValue()); + + for (unsigned i = 1, e = gep->getNumIndices(); i < e; ++i) + indexStack.push_back(m_builder->CreateZExtOrTrunc(gep->idx_begin()[i], m_builder->getInt32Ty())); + + lowerInOutUsersInPlace(globalVar, gep, indexStack); + + indexStack.clear(); + } else if (isa(inst) || isa(inst)) { + auto *loadInst = dyn_cast(inst); + auto *storeInst = dyn_cast(inst); + + m_builder->SetInsertPoint(inst); + + Value *vertexOrPrimitiveIdx = nullptr; + auto inOutTy = globalVar->getValueType(); + auto accessTy = loadInst ? 
loadInst->getType() : storeInst->getValueOperand()->getType(); + auto addrSpace = globalVar->getAddressSpace(); + + MDNode *metaNode = globalVar->getMetadata(gSPIRVMD::InOut); + assert(metaNode); + auto inOutMetaVal = mdconst::dyn_extract(metaNode->getOperand(0)); + + auto indexOperands = ArrayRef(indexStack); + + // If the input/output is arrayed, the outermost index might be used for vertex indexing + if (inOutTy->isArrayTy() && (hasVertexIdx(*inOutMetaVal) || hasPrimitiveIdx(*inOutMetaVal))) { + if (!indexOperands.empty()) { + vertexOrPrimitiveIdx = indexOperands.front(); + indexOperands = indexOperands.drop_front(); + } else if (inOutTy != accessTy) { + vertexOrPrimitiveIdx = m_builder->getInt32(0); + } + inOutTy = inOutTy->getArrayElementType(); + inOutMetaVal = cast(inOutMetaVal->getOperand(1)); + } - // Remove inputs if they are lowered in-place - if (m_lowerInputInPlace) { - for (auto inputMap : m_inputProxyMap) { - auto input = cast(inputMap.first); - assert(input->use_empty()); - input->eraseFromParent(); + if (loadInst) { + Value *loadValue = loadInOutMember(inOutTy, accessTy, addrSpace, indexOperands, 0, inOutMetaVal, nullptr, + vertexOrPrimitiveIdx, InterpLocUnknown, nullptr, false); + loadInst->replaceAllUsesWith(loadValue); + } else { + Value *storeValue = storeInst->getOperand(0); + storeOutputMember(inOutTy, accessTy, storeValue, indexOperands, 0, inOutMetaVal, nullptr, vertexOrPrimitiveIdx); + } + } else { + llvm_unreachable("unhandled user of input/output variable"); } - } - // Remove outputs if they are lowered in-place - if (m_lowerOutputInPlace) { - for (auto outputMap : m_outputProxyMap) { - auto output = cast(outputMap.first); - assert(output->use_empty()); - output->eraseFromParent(); - } + inst->eraseFromParent(); } } @@ -1129,7 +828,6 @@ Value *SpirvLowerGlobal::addCallInstForInOutImport(Type *inOutTy, unsigned addrS vertexIdx, interpLoc, auxInterpValue, isPerVertexDimension); } inOutValue = m_builder->CreateInsertValue(inOutValue, elem, 
{idx}); - // clang-format on } } } @@ -1176,7 +874,7 @@ Value *SpirvLowerGlobal::addCallInstForInOutImport(Type *inOutTy, unsigned addrS if (addrSpace == SPIRAS_Input) { // In the case where the command has no baseVertex parameter, force the value of gl_BaseVertex to zero if (builtIn == lgc::BuiltInBaseVertex && - m_context->getPipelineContext()->getPipelineOptions()->disableBaseVertex) + m_context->getPipelineContext()->getPipelineOptions()->getGlState().disableBaseVertex) inOutValue = m_builder->getInt32(0); else inOutValue = m_builder->CreateReadBuiltInInput(builtIn, inOutInfo, vertexIdx, elemIdx); @@ -2297,6 +1995,7 @@ void SpirvLowerGlobal::lowerUniformConstants() { // ===================================================================================================================== // Interpolates an element of the input. // +// @param returnTy : the return type of the interpolation // @param interpLoc : Interpolation location, valid for fragment shader (use "InterpLocUnknown" as don't-care value) // @param auxInterpValue : Auxiliary value of interpolation (valid for fragment shader): - Sample ID for // "InterpLocSample" - Offset from the center of the pixel for "InterpLocCenter" - Vertex no. 
(0 ~ 2) for @@ -2304,12 +2003,10 @@ void SpirvLowerGlobal::lowerUniformConstants() { // @param callInst : "Call" instruction // @param indexOperands : indices of GEP instruction // @param gv : Global Variable instruction -void SpirvLowerGlobal::interpolateInputElement(unsigned interpLoc, Value *auxInterpValue, CallInst &callInst, - GlobalVariable *gv, ArrayRef indexOperands) { +Value *SpirvLowerGlobal::interpolateInputElement(Type *returnTy, unsigned interpLoc, Value *auxInterpValue, + GlobalVariable *gv, ArrayRef indexOperands) { assert((indexOperands.empty() || cast(indexOperands.front())->isZero()) && "Non-zero GEP first index\n"); - m_builder->SetInsertPoint(&callInst); - auto inputTy = gv->getValueType(); MDNode *metaNode = gv->getMetadata(gSPIRVMD::InOut); @@ -2328,36 +2025,26 @@ void SpirvLowerGlobal::interpolateInputElement(unsigned interpLoc, Value *auxInt if (hasAllConstantIndices(indexOperands)) { if (!indexOperands.empty()) indexOperands = indexOperands.drop_front(); - auto loadValue = loadInOutMember(inputTy, callInst.getFunctionType()->getReturnType(), SPIRAS_Input, indexOperands, - 0, inputMeta, nullptr, nullptr, interpLoc, auxInterpValue, false); - - m_interpCalls.insert(&callInst); - callInst.replaceAllUsesWith(loadValue); - } else { - // Interpolant an element via dynamic index by extending interpolant to each element - // - // Regardless of where we do the interpolation, the alloca for the temporary must be inserted in the function entry - // block for efficient code generation, so we don't use the builder for it. 
- auto interpPtr = new AllocaInst(inputTy, m_module->getDataLayout().getAllocaAddrSpace(), Twine(), - &*(m_entryPoint->begin()->getFirstInsertionPt())); - // Load all possibly accessed values - auto loadValue = loadDynamicIndexedMembers(inputTy, SPIRAS_Input, ArrayRef(indexOperands).drop_front(), inputMeta, - nullptr, interpLoc, auxInterpValue, false); - - m_builder->CreateStore(loadValue, interpPtr); - - auto interpElemPtr = m_builder->CreateGEP(inputTy, interpPtr, indexOperands); - auto interpElemTy = GetElementPtrInst::getIndexedType(inputTy, indexOperands); - - // Only get the value that the original getElemPtr points to - auto interpElemValue = m_builder->CreateLoad(interpElemTy, interpElemPtr); - callInst.replaceAllUsesWith(interpElemValue); - - if (callInst.user_empty()) { - callInst.dropAllReferences(); - callInst.eraseFromParent(); - } + return loadInOutMember(inputTy, returnTy, SPIRAS_Input, indexOperands, 0, inputMeta, nullptr, nullptr, interpLoc, + auxInterpValue, false); } + + // Interpolate an element via dynamic index by extending interpolant to each element + // + // Regardless of where we do the interpolation, the alloca for the temporary must be inserted in the function entry + // block for efficient code generation, so we don't use the builder for it. 
+ auto interpPtr = m_builder->CreateAllocaAtFuncEntry(inputTy); + // Load all possibly accessed values + auto loadValue = loadDynamicIndexedMembers(inputTy, SPIRAS_Input, ArrayRef(indexOperands).drop_front(), inputMeta, + nullptr, interpLoc, auxInterpValue, false); + + m_builder->CreateStore(loadValue, interpPtr); + + auto interpElemPtr = m_builder->CreateGEP(inputTy, interpPtr, indexOperands); + auto interpElemTy = GetElementPtrInst::getIndexedType(inputTy, indexOperands); + + // Only get the value that the original getElemPtr points to + return m_builder->CreateLoad(interpElemTy, interpElemPtr); } // ===================================================================================================================== @@ -2613,21 +2300,23 @@ void SpirvLowerGlobal::changeRtFunctionSignature() { } } + SmallVector globalsToErase; + if (hitAttributeVar && m_entryPoint->arg_size() == 2) { assert(!rayTracingContext->isContinuationsMode() || m_shaderStage != ShaderStageRayTracingIntersect); convertUsersOfConstantsToInstructions(hitAttributeVar); hitAttributeVar->replaceAllUsesWith(m_entryPoint->getArg(1)); - m_globalsToErase.push_back(hitAttributeVar); + globalsToErase.push_back(hitAttributeVar); } if (incomingPayloadVar) { convertUsersOfConstantsToInstructions(incomingPayloadVar); incomingPayloadVar->replaceAllUsesWith(m_entryPoint->getArg(0)); - m_globalsToErase.push_back(incomingPayloadVar); + globalsToErase.push_back(incomingPayloadVar); } else if (incomingCallableDataVar) { convertUsersOfConstantsToInstructions(incomingCallableDataVar); incomingCallableDataVar->replaceAllUsesWith(m_entryPoint->getArg(0)); - m_globalsToErase.push_back(incomingCallableDataVar); + globalsToErase.push_back(incomingCallableDataVar); } if (rayTracingContext->isContinuationsMode()) { @@ -2647,11 +2336,10 @@ void SpirvLowerGlobal::changeRtFunctionSignature() { contFuncTy.writeMetadata(newFunc); } - for (auto globalVar : m_globalsToErase) { + for (auto globalVar : globalsToErase) { 
globalVar->dropAllReferences(); globalVar->eraseFromParent(); } - m_globalsToErase.clear(); } } // namespace Llpc diff --git a/llpc/lower/llpcSpirvLowerGlobal.h b/llpc/lower/llpcSpirvLowerGlobal.h index 1ca6cd6ade..700f9c870b 100644 --- a/llpc/lower/llpcSpirvLowerGlobal.h +++ b/llpc/lower/llpcSpirvLowerGlobal.h @@ -62,14 +62,12 @@ class SpirvLowerGlobal : public SpirvLower, public llvm::PassInfoMixin &indexStack); - llvm::ReturnInst *ensureUnifiedReturn(); + void ensureUnifiedReturn(); - void lowerInput(); - void lowerOutput(); - void lowerInOutInPlace(); void lowerBufferBlock(); void lowerTaskPayload(); void lowerPushConsts(); @@ -105,30 +103,17 @@ class SpirvLowerGlobal : public SpirvLower, public llvm::PassInfoMixin indexOperands, unsigned maxLocOffset, llvm::Constant *outputMeta, llvm::Value *locOffset, llvm::Value *vertexOrPrimitiveIdx); - void interpolateInputElement(unsigned interpLoc, llvm::Value *interpInfo, llvm::CallInst &callInst, - GlobalVariable *gv, ArrayRef indexOperands); + llvm::Value *interpolateInputElement(llvm::Type *returnTy, unsigned interpLoc, llvm::Value *interpInfo, + GlobalVariable *gv, ArrayRef indexOperands); void buildApiXfbMap(); void addCallInstForXfbOutput(const ShaderInOutMetadata &outputMeta, Value *outputValue, unsigned xfbBufferAdjust, unsigned xfbOffsetAdjust, unsigned locOffset, lgc::InOutInfo outputInfo); - llvm::SmallVector m_globalsToErase; - std::unordered_map m_inputProxyMap; // Proxy map for lowering inputs - - // NOTE: Here we use list to store pairs of output proxy mappings. This is because we want output patching to be - // "ordered" (resulting LLVM IR for the patching always be consistent). 
- std::list> m_outputProxyMap; // Proxy list for lowering outputs - - bool m_lowerInputInPlace; // Whether to lower input inplace - bool m_lowerOutputInPlace; // Whether to lower output inplace - - std::unordered_set m_emitCalls; // "Call" instructions to emit vertex (geometry shader) - std::unordered_set m_loadInsts; // "Load" instructions to be removed - std::unordered_set m_storeInsts; // "Store" instructions to be removed - std::unordered_set m_interpCalls; // "Call" instruction to do input interpolation - // (fragment shader) - ShaderStage m_lastVertexProcessingStage; // The last vertex processing stage + llvm::ReturnInst *m_unifiedReturn = nullptr; + std::unordered_set m_emitCalls; // "Call" instructions to emit vertex (geometry shader) + ShaderStage m_lastVertexProcessingStage; // The last vertex processing stage llvm::DenseMap m_builtInXfbMap; // Map built-in to XFB output info specified by API interface llvm::DenseMap diff --git a/llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp b/llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp index c8c644855c..65a0bcef1e 100644 --- a/llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp +++ b/llpc/lower/llpcSpirvLowerInternalLibraryIntrinsicUtil.cpp @@ -175,13 +175,13 @@ static void createAtomic(Function *func, Builder *builder, bool is64, bool isCmp // Create GEP to get the byte address with byte offset gpuAddrAsPtr = builder->CreateGEP(builder->getInt8Ty(), gpuAddrAsPtr, offset); Value *atomicValue = nullptr; + SyncScope::ID scope = func->getContext().getOrInsertSyncScopeID("agent"); if (!isCmpXchg) { assert(binOp != AtomicRMWInst::BAD_BINOP); - atomicValue = builder->CreateAtomicRMW(binOp, gpuAddrAsPtr, value, MaybeAlign(), AtomicOrdering::Monotonic, - SyncScope::System); + atomicValue = builder->CreateAtomicRMW(binOp, gpuAddrAsPtr, value, MaybeAlign(), AtomicOrdering::Monotonic, scope); } else { atomicValue = builder->CreateAtomicCmpXchg(gpuAddrAsPtr, compare, value, MaybeAlign(), 
AtomicOrdering::Monotonic, - AtomicOrdering::Monotonic, SyncScope::System); + AtomicOrdering::Monotonic, scope); atomicValue = builder->CreateExtractValue(atomicValue, 0); } builder->CreateRet(atomicValue); diff --git a/llpc/lower/llpcSpirvLowerTranslator.cpp b/llpc/lower/llpcSpirvLowerTranslator.cpp index 575f1ed0ac..9c61f90cd4 100644 --- a/llpc/lower/llpcSpirvLowerTranslator.cpp +++ b/llpc/lower/llpcSpirvLowerTranslator.cpp @@ -101,7 +101,8 @@ void SpirvLowerTranslator::translateSpirvToLlvm(const PipelineShaderInfo *shader for (const auto &range : descriptorRangeValues) { if (range.type == ResourceMappingNodeType::DescriptorYCbCrSampler) { uint32_t rangeSet = range.set; - if (context->getPipelineContext()->getPipelineOptions()->replaceSetWithResourceType && range.set == 0) { + if (context->getPipelineContext()->getPipelineOptions()->getGlState().replaceSetWithResourceType && + range.set == 0) { rangeSet = PipelineContext::getGlResourceNodeSetFromType(range.type); } convertingSamplers.push_back( diff --git a/llpc/test/lit.cfg.py b/llpc/test/lit.cfg.py index d545648727..896718f119 100644 --- a/llpc/test/lit.cfg.py +++ b/llpc/test/lit.cfg.py @@ -66,6 +66,9 @@ if 'Undefined' in config.xgl_sanitizers: config.available_features.add('ubsan') +if config.llpc_is_standalone != 'ON': + config.available_features.add('gpurt') + llvm_config.use_default_substitutions() config.substitutions.append(('%PATH%', config.environment['PATH'])) diff --git a/llpc/test/lit.site.cfg.py.in b/llpc/test/lit.site.cfg.py.in index c80a3bc6a4..31bd7a41b8 100644 --- a/llpc/test/lit.site.cfg.py.in +++ b/llpc/test/lit.site.cfg.py.in @@ -15,6 +15,7 @@ config.gfxip = "@AMDLLPC_DEFAULT_TARGET@" # Propagate CMake options used in lit feature tests. 
config.llvm_assertions = "@LLVM_ENABLE_ASSERTIONS@" config.xgl_sanitizers = "@XGL_USE_SANITIZER@" +config.llpc_is_standalone = "@LLPC_IS_STANDALONE@" for d in "@LIT_DEFINITIONS@".split(";"): def_split = d.split("=") diff --git a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm index f1f8eecaea..7eaa31a17b 100644 --- a/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm +++ b/llpc/test/shaderdb/core/ObjNonUniform_TestTexutreLoadStoreInt64.spvasm @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by tool/update_llpc_test_checks.py ; BEGIN_SHADERTEST -; RUN: amdllpc --print-after=llpc-spirv-lower-translator -o - 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s +; RUN: amdllpc --print-after=llpc-spirv-lower-translator -filetype=asm -o - 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s ; #version 450 ; #extension GL_EXT_nonuniform_qualifier : require ; #extension GL_ARB_gpu_shader_int64 : require @@ -88,7 +88,7 @@ OpFunctionEnd ; SHADERTEST-LABEL: @main( ; SHADERTEST-NEXT: .entry: -; SHADERTEST-NEXT: [[TMP0:%.*]] = alloca { [3 x <8 x i32>], { <4 x i32>, i32 } }, align 32, addrspace(5) +; SHADERTEST-NEXT: [[TMP0:%.*]] = alloca { [3 x ptr addrspace(4)], { ptr addrspace(4), i32 } }, align 8, addrspace(5) ; SHADERTEST-NEXT: [[_12:%.*]] = alloca i64, align 8, addrspace(5) ; SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 7) ; SHADERTEST-NEXT: [[TMP2:%.*]] = call i32 (...) 
@lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 7) @@ -127,20 +127,21 @@ ; SHADERTEST-NEXT: [[TMP31:%.*]] = extractvalue { { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } } [[TMP30]], 1 ; SHADERTEST-NEXT: [[TMP32:%.*]] = extractvalue { ptr addrspace(4), i32, i32 } [[TMP31]], 2 ; SHADERTEST-NEXT: [[TMP33:%.*]] = extractvalue { ptr addrspace(4), i32, i32 } [[TMP31]], 0 -; SHADERTEST-NEXT: [[TMP34:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP33]], align 16, !invariant.load !4 -; SHADERTEST-NEXT: [[TMP35:%.*]] = insertvalue { <4 x i32>, i32 } poison, <4 x i32> [[TMP34]], 0 -; SHADERTEST-NEXT: [[TMP36:%.*]] = insertvalue { <4 x i32>, i32 } [[TMP35]], i32 [[TMP32]], 1 -; SHADERTEST-NEXT: [[TMP37:%.*]] = extractvalue { { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } } [[TMP30]], 0 -; SHADERTEST-NEXT: [[TMP38:%.*]] = extractvalue { ptr addrspace(4), i32, i32, i32 } [[TMP37]], 0 -; SHADERTEST-NEXT: [[TMP39:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP38]], align 32, !invariant.load !4 -; SHADERTEST-NEXT: [[TMP40:%.*]] = insertvalue [3 x <8 x i32>] poison, <8 x i32> [[TMP39]], 0 -; SHADERTEST-NEXT: [[TMP41:%.*]] = insertvalue { [3 x <8 x i32>], { <4 x i32>, i32 } } poison, [3 x <8 x i32>] [[TMP40]], 0 -; SHADERTEST-NEXT: [[TMP42:%.*]] = insertvalue { [3 x <8 x i32>], { <4 x i32>, i32 } } [[TMP41]], { <4 x i32>, i32 } [[TMP36]], 1 -; SHADERTEST-NEXT: call void @"spirv.NonUniform.s[a3v8i32,s[v4i32,i32]]"({ [3 x <8 x i32>], { <4 x i32>, i32 } } [[TMP42]]) -; SHADERTEST-NEXT: store { [3 x <8 x i32>], { <4 x i32>, i32 } } [[TMP42]], ptr addrspace(5) [[TMP0]], align 32 -; SHADERTEST-NEXT: [[TMP43:%.*]] = load { [3 x <8 x i32>], { <4 x i32>, i32 } }, ptr addrspace(5) [[TMP0]], align 32 -; SHADERTEST-NEXT: [[TMP44:%.*]] = extractvalue { [3 x <8 x i32>], { <4 x i32>, i32 } } [[TMP43]], 1 -; SHADERTEST-NEXT: [[TMP45:%.*]] = extractvalue { [3 x <8 x i32>], { <4 x i32>, i32 } } [[TMP43]], 0 -; SHADERTEST-NEXT: [[TMP46:%.*]] = 
extractvalue [3 x <8 x i32>] [[TMP45]], 0 -; SHADERTEST-NEXT: [[TMP47:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP44]], 0 -; SHADERTEST-NEXT: [[TMP48:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> [[TMP46]], <4 x i32> [[TMP47]], i32 1, <2 x float> zeroinitializer) +; SHADERTEST-NEXT: [[TMP34:%.*]] = insertvalue { ptr addrspace(4), i32 } poison, ptr addrspace(4) [[TMP33]], 0 +; SHADERTEST-NEXT: [[TMP35:%.*]] = insertvalue { ptr addrspace(4), i32 } [[TMP34]], i32 [[TMP32]], 1 +; SHADERTEST-NEXT: [[TMP36:%.*]] = extractvalue { { ptr addrspace(4), i32, i32, i32 }, { ptr addrspace(4), i32, i32 } } [[TMP30]], 0 +; SHADERTEST-NEXT: [[TMP37:%.*]] = extractvalue { ptr addrspace(4), i32, i32, i32 } [[TMP36]], 0 +; SHADERTEST-NEXT: [[TMP38:%.*]] = insertvalue [3 x ptr addrspace(4)] poison, ptr addrspace(4) [[TMP37]], 0 +; SHADERTEST-NEXT: [[TMP39:%.*]] = insertvalue { [3 x ptr addrspace(4)], { ptr addrspace(4), i32 } } poison, [3 x ptr addrspace(4)] [[TMP38]], 0 +; SHADERTEST-NEXT: [[TMP40:%.*]] = insertvalue { [3 x ptr addrspace(4)], { ptr addrspace(4), i32 } } [[TMP39]], { ptr addrspace(4), i32 } [[TMP35]], 1 +; SHADERTEST-NEXT: call void @"spirv.NonUniform.s[a3p4,s[p4,i32]]"({ [3 x ptr addrspace(4)], { ptr addrspace(4), i32 } } [[TMP40]]) +; SHADERTEST-NEXT: store { [3 x ptr addrspace(4)], { ptr addrspace(4), i32 } } [[TMP40]], ptr addrspace(5) [[TMP0]], align 8 +; SHADERTEST-NEXT: [[TMP41:%.*]] = load { [3 x ptr addrspace(4)], { ptr addrspace(4), i32 } }, ptr addrspace(5) [[TMP0]], align 8 +; SHADERTEST-NEXT: [[TMP42:%.*]] = extractvalue { [3 x ptr addrspace(4)], { ptr addrspace(4), i32 } } [[TMP41]], 1 +; SHADERTEST-NEXT: [[TMP43:%.*]] = extractvalue { [3 x ptr addrspace(4)], { ptr addrspace(4), i32 } } [[TMP41]], 0 +; SHADERTEST-NEXT: [[TMP44:%.*]] = extractvalue [3 x ptr addrspace(4)] [[TMP43]], 0 +; SHADERTEST-NEXT: [[TMP45:%.*]] = extractvalue { ptr addrspace(4), i32 } [[TMP42]], 0 +; 
SHADERTEST-NEXT: [[TMP46:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) [[TMP44]], ptr addrspace(4) [[TMP45]], i32 1, <2 x float> zeroinitializer) +; SHADERTEST-NEXT: store <4 x float> [[TMP46]], ptr addrspace(65) @_3, align 16 +; SHADERTEST-NEXT: ret void +; diff --git a/llpc/test/shaderdb/core/OpAccessChain_TestBlockVectorExtract_lit.frag b/llpc/test/shaderdb/core/OpAccessChain_TestBlockVectorExtract_lit.frag index 0e28ffad56..f7005d4991 100644 --- a/llpc/test/shaderdb/core/OpAccessChain_TestBlockVectorExtract_lit.frag +++ b/llpc/test/shaderdb/core/OpAccessChain_TestBlockVectorExtract_lit.frag @@ -40,11 +40,11 @@ void main() ; SHADERTEST: %[[COLUMN1:.*]] = type <{ [3 x float], [4 x i8] }> ; SHADERTEST: %[[COLUMN2:.*]] = type <{ [4 x double] }> -; SHADERTEST: getelementptr inbounds (<{ [3 x float], [4 x i8], [2 x %[[COLUMN1]]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 0, i32 1 +; SHADERTEST: getelementptr {{(inbounds )?}}(<{ [3 x float], [4 x i8], [2 x %[[COLUMN1]]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 0, i32 1 ; SHADERTEST: getelementptr <{ [3 x float], [4 x i8], [2 x %[[COLUMN1]]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 2, i32 1, i32 0, i32 %{{[0-9]*}} ; SHADERTEST: getelementptr <{ [3 x float], [4 x i8], [2 x %[[COLUMN1]]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 2, i32 %{{[0-9]*}}, i32 0, i32 1 ; SHADERTEST: getelementptr <{ [4 x double], [4 x %[[COLUMN2]]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 0, i32 %{{[0-9]*}} -; SHADERTEST: getelementptr inbounds (<{ [4 x double], [4 x %[[COLUMN2]]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 1, i32 2, i32 0, i32 3 +; SHADERTEST: getelementptr {{(inbounds )?}}(<{ [4 x double], [4 x %[[COLUMN2]]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 1, i32 2, i32 0, i32 3 ; SHADERTEST: getelementptr <{ [4 x double], [4 x %[[COLUMN2]]] }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 1, i32 %{{[0-9]*}}, i32 0, i32 
%{{[0-9]*}} ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImageDimension_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestImageDimension_lit.comp index 1e2d98cc1c..2d46dfd421 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImageDimension_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImageDimension_lit.comp @@ -124,94 +124,94 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 0, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 2, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 9, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 3, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 0, i32 0, i32 0, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 4, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 5, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 8, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 6, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 2, i32 7, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 0, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 2, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 9, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 3, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 0, i32 0, i32 0, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 4, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 5, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 8, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 6, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 7, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 0, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 6, i32 2, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 9, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 3, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 0, i32 0, i32 0, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 4, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 5, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 8, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 6, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 7, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 0, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 2, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 9, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 3, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 8, i32 0, i32 0, i32 0, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 4, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 5, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 8, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 6, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 7, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 0, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 2, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 9, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 3, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 0, i32 0, i32 0, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 4, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 5, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 9, i32 8, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 6, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 7, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 0, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 2, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 9, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 3, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 0, i32 0, i32 0, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 4, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 5, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 8, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 6, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 7, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 0, i32 0, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 2, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 9, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 3, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 0, i32 0, i32 0, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 4, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 5, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 8, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 6, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 7, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 0, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9, i32 3) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9, i32 3) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 2, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9, i32 3) -; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.compare.swap.i32(i32 9, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9, i32 3) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 3, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 0, i32 0, i32 0, <4 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 4, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 5, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 8, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 6, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 7, i32 0, i32 0, <8 x i32> +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 0, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 2, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 9, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 3, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 10, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 4, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 2, i32 5, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 8, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 6, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 7, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 0, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 2, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 9, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 3, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 10, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 4, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 5, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 8, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 4, i32 6, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 7, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 0, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 2, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 9, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 3, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 10, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 4, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 5, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 8, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 6, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 7, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 8, i32 0, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 2, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 9, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 3, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 10, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 4, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 5, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 8, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 6, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 7, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 0, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 9, i32 2, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 9, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 3, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 10, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 4, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 5, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 8, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 6, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 7, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 0, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 2, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 9, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 10, i32 3, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 10, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 4, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 5, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 8, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 6, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 7, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 0, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 2, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 9, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 3, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 10, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 0, i32 4, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 5, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 8, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 6, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 7, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <4 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 0, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 7, i32 9, i32 3) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9, i32 3) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 2, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <3 x i32> , i32 9, i32 3) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 9, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9, i32 3) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 3, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 10, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 4, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 5, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 8, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.compare.swap.i32(i32 6, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 7, i32 0, i32 0, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 9, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImageMemoryQualifier_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestImageMemoryQualifier_lit.comp index 8b2876bf51..3892925122 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImageMemoryQualifier_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImageMemoryQualifier_lit.comp @@ -16,9 +16,9 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 5, i16 5, <8 x i32> %{{.*}}, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp index 9241040a7e..c64f1c2cdb 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.comp @@ -46,23 +46,23 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 1, i32 0, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 5, i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 7, i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.atomic.f32(i32 0, i32 1, i32 0, i32 0, <8 x i32> +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 1, i32 0, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> , i32 9) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 2, i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 5, i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 7, i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.atomic.f32(i32 0, i32 1, i32 0, i32 0, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i16(i32 9, i16 7, i16 7, <8 x i32> %{{.*}}, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag index 2ee55dcbf6..81b0a25cc1 100644 --- a/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag +++ b/llpc/test/shaderdb/core/OpAtomicXXX_TestImage_lit.frag @@ -48,23 +48,23 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 0, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 1, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 1, i32 128, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 6, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 9, i32 0, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 0, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 0, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 0, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 3, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 5, i32 0, i32 0, i32 0, <4 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 7, i32 0, i32 128, i32 0, <4 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 7, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 3, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 3, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 3, i32 0, i32 0, <8 x i32> -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 3, i32 0, i32 0, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.atomic.f32(i32 0, i32 9, i32 0, i32 0, <8 x i32> +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 0, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 4, i32 1, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 6, i32 1, i32 128, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 6, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 0, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 0, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 0, i32 0, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 0, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 3, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 5, i32 10, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 7, i32 10, i32 128, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 8, i32 7, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 9, i32 3, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 10, i32 3, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 0, i32 3, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.compare.swap.i32(i32 3, i32 0, i32 0, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) 
@lgc.create.image.atomic.f32(i32 0, i32 9, i32 0, i32 0, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i16(i32 %{{.*}}, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpFMul_TestOperandIsZero.spvasm b/llpc/test/shaderdb/core/OpFMul_TestOperandIsZero.spvasm index ff8682c708..fd43a195b7 100644 --- a/llpc/test/shaderdb/core/OpFMul_TestOperandIsZero.spvasm +++ b/llpc/test/shaderdb/core/OpFMul_TestOperandIsZero.spvasm @@ -32,6 +32,8 @@ OpDecorate %22 FPFastMathMode NotInf OpDecorate %32 FPFastMathMode NotInf OpDecorate %38 FPFastMathMode NotInf + OpDecorate %33 NoContraction + OpDecorate %39 NoContraction %void = OpTypeVoid %3 = OpTypeFunction %void %float = OpTypeFloat 32 diff --git a/llpc/test/shaderdb/core/OpFOrdEqual_TestVec3_lit.frag b/llpc/test/shaderdb/core/OpFOrdEqual_TestVec3_lit.frag index 110b133f0d..b7e84aefe6 100644 --- a/llpc/test/shaderdb/core/OpFOrdEqual_TestVec3_lit.frag +++ b/llpc/test/shaderdb/core/OpFOrdEqual_TestVec3_lit.frag @@ -20,7 +20,7 @@ void main() // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP0]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ [[]3 x float], [[]4 x i8], [[]3 x float] }>}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 2}} +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr {{(inbounds i8|<{ [[]3 x float], [[]4 x i8], [[]3 x float] }>)|i8}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 2}} // CHECK-NEXT: [[TMP4:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP3]], align 16 // CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x float> [[TMP2]], i64 0 // CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[TMP4]], i64 0 diff --git 
a/llpc/test/shaderdb/core/OpFOrdNotEqual_TestVec3_lit.frag b/llpc/test/shaderdb/core/OpFOrdNotEqual_TestVec3_lit.frag index 05b9e49a03..0732bda8cb 100644 --- a/llpc/test/shaderdb/core/OpFOrdNotEqual_TestVec3_lit.frag +++ b/llpc/test/shaderdb/core/OpFOrdNotEqual_TestVec3_lit.frag @@ -20,7 +20,7 @@ void main() // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP0]], align 16 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ [[]3 x float], [[]4 x i8], [[]3 x float] }>}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 2}} +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ [[]3 x float], [[]4 x i8], [[]3 x float] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 2}} // CHECK-NEXT: [[TMP4:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP3]], align 16 // CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x float> [[TMP2]], i64 0 // CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x float> [[TMP4]], i64 0 diff --git a/llpc/test/shaderdb/core/OpIEqual_TestIvec2_lit.frag b/llpc/test/shaderdb/core/OpIEqual_TestIvec2_lit.frag index 01adb95c6a..1ce6abbbbc 100644 --- a/llpc/test/shaderdb/core/OpIEqual_TestIvec2_lit.frag +++ b/llpc/test/shaderdb/core/OpIEqual_TestIvec2_lit.frag @@ -20,7 +20,7 @@ void main() // SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // SHADERTEST-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(7) [[TMP0]], align 8 -// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ [[]2 x i32], [[]2 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} +// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ [[]2 x i32], [[]2 x i32] }>|i8}}, ptr addrspace(7) 
[[TMP0]], i32 {{8|0, i32 1}} // SHADERTEST-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(7) [[TMP3]], align 8 // SHADERTEST-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i64 0 // SHADERTEST-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i64 0 diff --git a/llpc/test/shaderdb/core/OpINotEqual_TestIvec2_lit.frag b/llpc/test/shaderdb/core/OpINotEqual_TestIvec2_lit.frag index d58cc30b47..3fedb0e70d 100644 --- a/llpc/test/shaderdb/core/OpINotEqual_TestIvec2_lit.frag +++ b/llpc/test/shaderdb/core/OpINotEqual_TestIvec2_lit.frag @@ -20,7 +20,7 @@ void main() // SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // SHADERTEST-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(7) [[TMP0]], align 8 -// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ [[]2 x i32], [[]2 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} +// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ [[]2 x i32], [[]2 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} // SHADERTEST-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(7) [[TMP3]], align 8 // SHADERTEST-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i64 0 // SHADERTEST-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i64 0 diff --git a/llpc/test/shaderdb/core/OpImageDrefGather_TestBasic_lit.frag b/llpc/test/shaderdb/core/OpImageDrefGather_TestBasic_lit.frag index 151dde1a3a..380040dcfa 100644 --- a/llpc/test/shaderdb/core/OpImageDrefGather_TestBasic_lit.frag +++ b/llpc/test/shaderdb/core/OpImageDrefGather_TestBasic_lit.frag @@ -17,9 +17,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: [[IMAGEPTR:%[0-9A-Za-z_.-]+]] = call {{.*}} @lgc.create.get.desc.ptr.p4{{.*}}(i32 1, i32 1, i64 0, i32 0 ; SHADERTEST: [[SAMPLERPTR:%[0-9A-Za-z_.-]+]] = call {{.*}} 
@lgc.create.get.desc.ptr.p4{{.*}}(i32 2, i32 2, i64 0, i32 0 -; SHADERTEST: [[SAMPLER:%[0-9A-Za-z_.-]+]] = load <4 x i32>, {{<4 x i32> addrspace\(4\)\*|ptr addrspace\(4\)}} [[SAMPLERPTR]] -; SHADERTEST: [[IMAGE:%[0-9A-Za-z_.-]+]] = load <8 x i32>, {{<8 x i32> addrspace\(4\)\*|ptr addrspace\(4\)}} [[IMAGEPTR]] -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> {{.*}}@lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x i32> [[IMAGE]], <4 x i32> [[SAMPLER]],{{.*}},{{.*}} float 2.000000e+00 +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> {{.*}}@lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) [[IMAGEPTR]], ptr addrspace(4) [[SAMPLERPTR]],{{.*}},{{.*}} float 2.000000e+00 ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float 2.000000e+00, diff --git a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffset_lit.frag b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffset_lit.frag index b6e0067e9d..42c07e1bba 100644 --- a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffset_lit.frag +++ b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGatherOffset_lit.frag @@ -27,9 +27,9 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 0) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 384, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, <8 x i32> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 384, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 257, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGather_lit.frag b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGather_lit.frag index 77074845cf..d5f32f94b5 100644 --- a/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGather_lit.frag +++ b/llpc/test/shaderdb/core/OpImageDrefGather_TestTextureGather_lit.frag @@ -1,3 +1,4 @@ + #version 450 layout(set = 0, binding = 0) uniform sampler2DShadow samp2DShadow; @@ -33,7 +34,7 @@ void main() ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 2, i32 2, i64 0, i32 0) ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, {{.*}}, i32 545, <2 x float> , float 0.000000e+00, float 0x3FECCCCCC0000000) ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 384, {{.*}}, i32 545, <3 x float> , float 0.000000e+00, float 0x3FE99999A0000000) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 545, <2 x float> , float 0.000000e+00, float 0x3FE6666660000000) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 9, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 545, <2 x float> , float 0.000000e+00, float 0x3FE6666660000000) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.2d.v4f32.f32(i32 1, float 0x3FECCCCCC0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageExplicitLod_TestDrefLodOffset_lit.frag b/llpc/test/shaderdb/core/OpImageExplicitLod_TestDrefLodOffset_lit.frag index 2ccb975b3a..5e243f4bcd 100644 --- a/llpc/test/shaderdb/core/OpImageExplicitLod_TestDrefLodOffset_lit.frag +++ b/llpc/test/shaderdb/core/OpImageExplicitLod_TestDrefLodOffset_lit.frag @@ -17,7 +17,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 0 -; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.sample.f32(i32 1, i32 512, <8 x i32>{{.*}}, i32 801,{{.*}}, float 1.000000e+00, <2 x i32> , +; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) 
@lgc.create.image.sample.f32(i32 1, i32 512, ptr addrspace(4){{.*}}, i32 801,{{.*}}, float 1.000000e+00, <2 x i32> , ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.l.o.2d.f32.f32(i32 1, i32 770,{{.*}},{{.*}},{{.*}}, float 1.000000e+00,{{.*}},{{.*}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageFetch_TestBuffer_lit.comp b/llpc/test/shaderdb/core/OpImageFetch_TestBuffer_lit.comp index aa8f65b773..c97da7dea4 100644 --- a/llpc/test/shaderdb/core/OpImageFetch_TestBuffer_lit.comp +++ b/llpc/test/shaderdb/core/OpImageFetch_TestBuffer_lit.comp @@ -19,7 +19,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0 -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 1536, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 3) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 10, i32 1536, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 3) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32({{.*}}, i32 3, i32 0, i32 0, i32 0), !invariant.load diff --git a/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetch_lit.frag b/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetch_lit.frag index 78d7897576..4dbca71ce8 100644 --- a/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetch_lit.frag +++ b/llpc/test/shaderdb/core/OpImageFetch_TestTexelFetch_lit.frag @@ -36,7 +36,7 @@ void main() ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 1536, {{.*}}, i32 2, i32 2) ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.load.v4f32(i32 1, i32 128, {{.*}}, <2 x i32> , i32 8) ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 9, i32 1536, {{.*}}, <2 x i32> ) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 1536, {{.*}}, i32 5) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 10, i32 1536, {{.*}}, i32 5) ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.with.fmask.v4f32(i32 6, i32 128, {{.*}}, {{.*}}, <2 x i32> , i32 4) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results diff --git a/llpc/test/shaderdb/core/OpImageGather_TestConstOffsets_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestConstOffsets_lit.frag index ac7612a0e2..428160ad38 100644 --- a/llpc/test/shaderdb/core/OpImageGather_TestConstOffsets_lit.frag +++ b/llpc/test/shaderdb/core/OpImageGather_TestConstOffsets_lit.frag @@ -18,7 +18,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 0 -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x {{.*}}, i32 2, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> ]) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x {{.*}}, i32 2, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> ]) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4, i32 513,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageGather_TestDrefConstOffsets_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestDrefConstOffsets_lit.frag index 09f96ac2b3..6d1959e231 100644 --- a/llpc/test/shaderdb/core/OpImageGather_TestDrefConstOffsets_lit.frag +++ b/llpc/test/shaderdb/core/OpImageGather_TestDrefConstOffsets_lit.frag @@ -18,7 +18,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 0 -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 801, <2 x {{.*}}, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> ], float 1.000000e+00) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 801, <2 x {{.*}}, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> ], float 1.000000e+00) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.2d.v4f32.f32(i32 1, i32 513, float 1.000000e+00,{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageGather_TestIntegerSampler.frag b/llpc/test/shaderdb/core/OpImageGather_TestIntegerSampler.frag index 97be87bb57..2eb83e3bf0 100644 --- a/llpc/test/shaderdb/core/OpImageGather_TestIntegerSampler.frag +++ b/llpc/test/shaderdb/core/OpImageGather_TestIntegerSampler.frag @@ -25,12 +25,12 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 1 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 0 -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 516, <8 x {{.*}}, <4 x {{.*}}, i32 37, <2 x float> , i32 0, float 0.000000e+00) -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 516, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, <2 x i32> ) -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 516, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> ]) -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 37, <2 x float> , i32 0, float 0.000000e+00) -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, <2 x i32> ) -; SHADERTEST: call <4 x i32> (...) 
@lgc.create.image.gather.v4i32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> ]) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 516, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <2 x float> , i32 0, float 0.000000e+00) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 516, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, <2 x i32> ) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 516, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> ]) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <2 x float> , i32 0, float 0.000000e+00) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.gather.v4i32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, <2 x i32> ) +; SHADERTEST: call <4 x i32> (...) 
@lgc.create.image.gather.v4i32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, [4 x <2 x i32>] [<2 x i32> , <2 x i32> , <2 x i32> , <2 x i32> ]) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f16(i32 1, half 0xH0000, half 0xH3C00, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageGather_TestOffset_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestOffset_lit.frag index 52fe89de91..6509cb2964 100644 --- a/llpc/test/shaderdb/core/OpImageGather_TestOffset_lit.frag +++ b/llpc/test/shaderdb/core/OpImageGather_TestOffset_lit.frag @@ -18,7 +18,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 0 -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x {{.*}}, i32 2, float 0.000000e+00, <2 x {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x {{.*}}, i32 2, float 0.000000e+00, <2 x {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4,{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherBiasLod_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherBiasLod_lit.frag index 7c7805cc3f..9ed1e5cc97 100644 --- a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherBiasLod_lit.frag +++ b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherBiasLod_lit.frag @@ -61,22 +61,22 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 1 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 2, i32 2, i64 0, i32 0 -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 69, <2 x {{.*}}, i32 0, {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 69, <3 x {{.*}}, i32 1, {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 3, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 69, <3 x {{.*}}, i32 2, {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 8, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 69, <4 x {{.*}}, i32 3, {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 325, <2 x {{.*}}, i32 0, {{.*}}, <2 x i32> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 5, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 325, <3 x {{.*}}, i32 1, {{.*}}, <2 x i32> ) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 325, <2 x {{.*}}, i32 0, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> , <2 x i32> , <2 x i32> ]) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 325, <3 x {{.*}}, i32 1, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> , <2 x i32> , <2 x i32> ]) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 37, <2 x {{.*}}, i32 0, {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 37, <3 x {{.*}}, i32 1, {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 3, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 37, <3 x {{.*}}, i32 2, {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 8, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 37, <4 x {{.*}}, i32 3, {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x {{.*}}, i32 0, {{.*}}, <2 x i32> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <3 x {{.*}}, i32 1, {{.*}}, <2 x i32> ) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x {{.*}}, i32 0, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> , <2 x i32> , <2 x i32> ]) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <3 x {{.*}}, i32 1, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> , <2 x i32> , <2 x i32> ]) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 69, <2 x {{.*}}, i32 0, {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 69, <3 x {{.*}}, i32 1, {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 3, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 69, <3 x {{.*}}, i32 2, {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 8, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 69, <4 x {{.*}}, i32 3, {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 325, <2 x {{.*}}, i32 0, {{.*}}, <2 x i32> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 325, <3 x {{.*}}, i32 1, {{.*}}, <2 x i32> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 325, <2 x {{.*}}, i32 0, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> , <2 x i32> , <2 x i32> ]) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 325, <3 x {{.*}}, i32 1, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> , <2 x i32> , <2 x i32> ]) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <2 x {{.*}}, i32 0, {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <3 x {{.*}}, i32 1, {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 3, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <3 x {{.*}}, i32 2, {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 8, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <4 x {{.*}}, i32 3, {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x {{.*}}, i32 0, {{.*}}, <2 x i32> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <3 x {{.*}}, i32 1, {{.*}}, <2 x i32> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x {{.*}}, i32 0, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> , <2 x i32> , <2 x i32> ]) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <3 x {{.*}}, i32 1, {{.*}}, [4 x <2 x i32>] [<2 x i32> zeroinitializer, <2 x i32> , <2 x i32> , <2 x i32> ]) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.b.2d.v4f32.f32.f32(i32 1,{{.*}},{{.*}},{{.*}},{{.*}},{{.*}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffset_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffset_lit.frag index 584be47256..d05255542d 100644 --- a/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffset_lit.frag +++ b/llpc/test/shaderdb/core/OpImageGather_TestTextureGatherOffset_lit.frag @@ -31,9 +31,9 @@ void main() ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 2, i32 2, i64 1, i32 0) ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 1, i32 1, i64 0, i32 0) ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 2, i32 2, i64 0, i32 0) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x float> , i32 2, float 0.000000e+00, <2 x {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 384, <8 x {{.*}}, <4 x {{.*}}, i32 293, <3 x float> , i32 3, float 0.000000e+00, <2 x {{.*}}) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 9, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, <2 x i32> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x float> , i32 2, float 0.000000e+00, <2 x {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 384, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <3 x float> , i32 3, float 0.000000e+00, <2 x {{.*}}) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 293, <2 x float> , i32 0, float 0.000000e+00, <2 x i32> ) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.o.2d.v4f32.f32(i32 4,{{.*}}, float 0x3FB99999A0000000, float 0x3FB99999A0000000,{{.*}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageGather_TestTextureGather_lit.frag b/llpc/test/shaderdb/core/OpImageGather_TestTextureGather_lit.frag index 5134dadc01..2bdc97e82b 100644 --- a/llpc/test/shaderdb/core/OpImageGather_TestTextureGather_lit.frag +++ b/llpc/test/shaderdb/core/OpImageGather_TestTextureGather_lit.frag @@ -30,9 +30,9 @@ void main() ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 2, i32 2, i64 1, i32 0) ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 1, i32 1, i64 0, i32 0) ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 2, i32 2, i64 0, i32 0) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 37, <2 x float> , i32 2, float 0.000000e+00) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.gather.v4f32(i32 5, i32 384, <8 x {{.*}}, <4 x {{.*}}, i32 37, <3 x float> , i32 3, float 0.000000e+00) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, <8 x {{.*}}, <4 x {{.*}}, i32 37, <2 x float> , i32 0, float 0.000000e+00) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 1, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <2 x float> , i32 2, float 0.000000e+00) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 5, i32 384, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <3 x float> , i32 3, float 0.000000e+00) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.gather.v4f32(i32 9, i32 512, ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 37, <2 x float> , i32 0, float 0.000000e+00) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 4, float 0x3FB99999A0000000, float 0x3FB99999A0000000, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageQueryLevels_TestTextureQueryLevels_lit.frag b/llpc/test/shaderdb/core/OpImageQueryLevels_TestTextureQueryLevels_lit.frag index 403ee311ff..9ca1f9f322 100644 --- a/llpc/test/shaderdb/core/OpImageQueryLevels_TestTextureQueryLevels_lit.frag +++ b/llpc/test/shaderdb/core/OpImageQueryLevels_TestTextureQueryLevels_lit.frag @@ -30,10 +30,10 @@ void main() ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 1, i32 1, i64 0, i32 1) ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 1, i32 1, i64 1, i32 0) ; SHADERTEST: ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4{{.*}}(i32 1, i32 1, i64 0, i32 0) -; SHADERTEST: call i32 (...) 
@lgc.create.image.query.levels.i32(i32 0, i32 512, <8 x {{.*}}) -; SHADERTEST: call i32 (...) @lgc.create.image.query.levels.i32(i32 1, i32 128, <8 x {{.*}}) -; SHADERTEST: call i32 (...) @lgc.create.image.query.levels.i32(i32 1, i32 512, <8 x {{.*}}) -; SHADERTEST: call i32 (...) @lgc.create.image.query.levels.i32(i32 8, i32 128, <8 x {{.*}}) +; SHADERTEST: call i32 (...) @lgc.create.image.query.levels.i32(i32 0, i32 512, ptr addrspace(4) {{.*}}) +; SHADERTEST: call i32 (...) @lgc.create.image.query.levels.i32(i32 1, i32 128, ptr addrspace(4) {{.*}}) +; SHADERTEST: call i32 (...) @lgc.create.image.query.levels.i32(i32 1, i32 512, ptr addrspace(4) {{.*}}) +; SHADERTEST: call i32 (...) @lgc.create.image.query.levels.i32(i32 8, i32 128, ptr addrspace(4) {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpImageQuerySize_TestBasic_lit.frag b/llpc/test/shaderdb/core/OpImageQuerySize_TestBasic_lit.frag index 483ef81f1d..45ed00b62b 100644 --- a/llpc/test/shaderdb/core/OpImageQuerySize_TestBasic_lit.frag +++ b/llpc/test/shaderdb/core/OpImageQuerySize_TestBasic_lit.frag @@ -106,7 +106,7 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v2i32(i32 9, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v2i32(i32 4, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v3i32(i32 5, i32 512, {{.*}}, i32 0) -; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 0, i32 512, {{.*}}, i32 0) +; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 10, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v2i32(i32 6, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v3i32(i32 7, i32 512, {{.*}}, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageQuerySize_TestImageSize_lit.frag b/llpc/test/shaderdb/core/OpImageQuerySize_TestImageSize_lit.frag index 
9a00841e7e..f9b18ff291 100644 --- a/llpc/test/shaderdb/core/OpImageQuerySize_TestImageSize_lit.frag +++ b/llpc/test/shaderdb/core/OpImageQuerySize_TestImageSize_lit.frag @@ -36,7 +36,7 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 0, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v2i32(i32 9, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v2i32(i32 6, i32 512, {{.*}}, i32 0) -; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 0, i32 128, {{.*}}, i32 0) +; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 10, i32 128, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v3i32(i32 8, i32 128, {{.*}}, i32 0) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results diff --git a/llpc/test/shaderdb/core/OpImageQuerySize_TestImage_lit.comp b/llpc/test/shaderdb/core/OpImageQuerySize_TestImage_lit.comp index fd99503fae..aaa04cdc47 100644 --- a/llpc/test/shaderdb/core/OpImageQuerySize_TestImage_lit.comp +++ b/llpc/test/shaderdb/core/OpImageQuerySize_TestImage_lit.comp @@ -60,7 +60,7 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v3i32(i32 8, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v2i32(i32 6, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v3i32(i32 7, i32 512, {{.*}}, i32 0) -; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 0, i32 512, {{.*}}, i32 0) +; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 10, i32 512, {{.*}}, i32 0) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpImageQuerySize_TestTextureSize_lit.frag b/llpc/test/shaderdb/core/OpImageQuerySize_TestTextureSize_lit.frag index 6c0b3a9485..dbc8973fd0 100644 --- a/llpc/test/shaderdb/core/OpImageQuerySize_TestTextureSize_lit.frag +++ 
b/llpc/test/shaderdb/core/OpImageQuerySize_TestTextureSize_lit.frag @@ -29,7 +29,7 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 1) ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v2i32(i32 9, i32 512, {{.*}}, i32 0) -; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 0, i32 128, {{.*}}, i32 0) +; SHADERTEST: call {{.*}} @lgc.create.image.query.size.i32(i32 10, i32 128, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v2i32(i32 6, i32 512, {{.*}}, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.query.size.v3i32(i32 7, i32 128, {{.*}}, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageRead_TestBuffer_lit.comp b/llpc/test/shaderdb/core/OpImageRead_TestBuffer_lit.comp index 8c2bdc89ad..048f86a7ab 100644 --- a/llpc/test/shaderdb/core/OpImageRead_TestBuffer_lit.comp +++ b/llpc/test/shaderdb/core/OpImageRead_TestBuffer_lit.comp @@ -19,7 +19,7 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 4, i32 4, i64 0, i32 0 -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 512, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 3) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.load.v4f32(i32 10, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 3) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.struct.buffer.load.format.v4f32({{.*}}, i32 3, i32 0, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageRead_TestImageLoad_lit.frag b/llpc/test/shaderdb/core/OpImageRead_TestImageLoad_lit.frag index 3f38b36347..0674a80145 100644 --- a/llpc/test/shaderdb/core/OpImageRead_TestImageLoad_lit.frag +++ b/llpc/test/shaderdb/core/OpImageRead_TestImageLoad_lit.frag @@ -35,7 +35,7 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 0, i32 512, {{.*}}, i32 1) ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 9, i32 512, {{.*}}, <2 x i32> ) -; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 0, i32 128, {{.*}}, i32 4) +; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 10, i32 128, {{.*}}, i32 4) ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 8, i32 128, {{.*}}, <4 x i32> ) ; SHADERTEST: call {{.*}} @lgc.create.image.load.v4f32(i32 6, i32 512, {{.*}}, <3 x i32> ) diff --git a/llpc/test/shaderdb/core/OpImageRead_TestMemoryQualifier_lit.comp b/llpc/test/shaderdb/core/OpImageRead_TestMemoryQualifier_lit.comp index 226e84d136..edba3359b8 100644 --- a/llpc/test/shaderdb/core/OpImageRead_TestMemoryQualifier_lit.comp +++ b/llpc/test/shaderdb/core/OpImageRead_TestMemoryQualifier_lit.comp @@ -27,10 +27,10 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 2 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 1 ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0 -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.load.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> ) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> ) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 513, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> ) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 515, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <2 x i32> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 513, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.load.v4f32(i32 1, i32 515, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, <2 x i32> ) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 1, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradClamp_lit.frag index 9257edf333..b7666b14b1 100644 --- a/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradClamp_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleExplicitLod_TestTextureGradClamp_lit.frag @@ -53,12 +53,12 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.sample.v4f32(i32 2, i32 512, {{.*}}, {{.*}}, i32 409, <3 x float> , <3 x float> , <3 x float> , {{.*}}, <3 x i32> ) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32({{.*}}, float 1.000000e+00, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32({{.*}}, float 1.000000e+00, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.3d.v4f32.f32.f32({{.*}}, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000, float 
0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, {{.*}}) ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32({{.*}}, i32 514, float 1.000000e+00, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF19999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.3d.v4f32.f32.f32({{.*}}, i32 197379, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FF4CCCCC0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, {{.*}}) diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestArrayDirectAccess_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestArrayDirectAccess_lit.frag index 64da83e7c7..e42cb57358 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestArrayDirectAccess_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestArrayDirectAccess_lit.frag @@ -14,7 +14,7 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.sample.v4f32(i32 1, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> ) ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBasic_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBasic_lit.frag index 69a0938216..79b46e8c01 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBasic_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestBasic_lit.frag @@ -13,7 +13,7 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestIntegerSampler_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestIntegerSampler_lit.frag index 5bddde6b7a..780f8fc1eb 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestIntegerSampler_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestIntegerSampler_lit.frag @@ -16,8 +16,8 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call <4 x i32> (...) 
@lgc.create.image.sample.v4i32(i32 1, i32 516, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> ) -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.sample.v4i32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> ) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.sample.v4i32(i32 1, i32 516, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> ) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.sample.v4i32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> ) ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 1) diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestMultiDimArrayDirectAccess_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestMultiDimArrayDirectAccess_lit.frag index d772429a42..af4dec00a8 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestMultiDimArrayDirectAccess_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestMultiDimArrayDirectAccess_lit.frag @@ -14,7 +14,7 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 0, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> ) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.sample.v4f32(i32 1, i32 0, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> ) ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestSeparate_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestSeparate_lit.frag index eb6fe23a80..60923e7b20 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestSeparate_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestSeparate_lit.frag @@ -14,7 +14,7 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 1) diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureBiasClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureBiasClamp_lit.frag index e141adff10..a132ad6d38 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureBiasClamp_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureBiasClamp_lit.frag @@ -74,31 +74,31 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.sample.v4f32(i32 8, i32 512, {{.*}}, {{.*}}, i32 193, <4 x float> %{{[0-9]*}}, float 2.000000e+00, {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.1d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.2d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; 
SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.3d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.3d.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubesc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubetc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubema(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubeid(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.cube.v4f32.f32.f32({{.*}}, float 2.000000e+00, {{.*}}) -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.1darray.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.1darray.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.2darray.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> 
@llvm.amdgcn.image.sample.b.cl.2darray.v4f32.f32.f32({{.*}}, float 2.000000e+00, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.b.cl.cube.v4f32.f32.f32({{.*}}, float 2.000000e+00, {{.*}}) ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureClamp_lit.frag index 002bcc9bf2..e48d5ec2f5 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureClamp_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureClamp_lit.frag @@ -74,30 +74,30 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.sample.v4f32(i32 8, i32 512, {{.*}}, {{.*}}, i32 129, <4 x float> %{{[0-9]*}}, {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32({{.*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.1d.v4f32.f32({{.*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.2d.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.3d.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float 
%{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.3d.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubesc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubetc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubema(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubeid(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.cube.v4f32.f32 -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.1darray.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.1darray.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.2darray.v4f32.f32({{.*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: call {{.*}} <4 x float> 
@llvm.amdgcn.image.sample.cl.cube.v4f32.f32 ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradClamp_lit.frag index add49732f6..6a83db23d5 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradClamp_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradClamp_lit.frag @@ -76,31 +76,31 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.sample.v4f32(i32 8, i32 512, {{.*}}, {{.*}}, i32 153, <4 x float> %{{[0-9]*}}, <3 x float> %{{[0-9]*}}, <3 x float> %{{[0-9]*}}, {{.*}}) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.1d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> 
@llvm.amdgcn.image.sample.d.cl.3d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.3d.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubesc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubetc(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubema(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} float @llvm.amdgcn.cubeid(float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}) ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.cube.v4f32.f32.f32 -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.1darray.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.1darray.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2darray.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float 
%{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.2darray.v4f32.f32.f32({{.*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[0-9]*}}, float %{{[.i0-9]*}}, float %{{[.i0-9]*}}, float %{{[0-9]*}}, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.cube.v4f32.f32.f32 ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradOffsetClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradOffsetClamp_lit.frag index 149360a1f3..b13c902967 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradOffsetClamp_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureGradOffsetClamp_lit.frag @@ -61,21 +61,21 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.sample.v4f32(i32 5, i32 512, {{.*}}, {{.*}}, i32 409, <3 x float> , <2 x float> , <2 x float> , {{.*}}, <2 x i32> ) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32({{.*}}, i32 2, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}} +; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1d.v4f32.f32.f32({{.*}}, i32 2, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}} 
-; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32({{.*}}, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2d.v4f32.f32.f32({{.*}}, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.3d.v4f32.f32.f32({{.*}}, i32 131586, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.3d.v4f32.f32.f32({{.*}}, i32 131586, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{.*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1darray.v4f32.f32.f32({{.*}}, i32 2, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.1darray.v4f32.f32.f32({{.*}}, i32 2, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) 
%{{.*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{.*}} ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.d.cl.o.2darray.v4f32.f32.f32({{.*}}, i32 514, float 0x3FC99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD3333340000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}}) ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffsetClamp_lit.frag b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffsetClamp_lit.frag index 761e8f8e83..5e7dd16029 100644 --- a/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffsetClamp_lit.frag +++ b/llpc/test/shaderdb/core/OpImageSampleImplicitLod_TestTextureOffsetClamp_lit.frag @@ -61,21 +61,21 @@ void main() ; SHADERTEST: call {{.*}} @lgc.create.image.sample.v4f32(i32 5, i32 512, {{.*}}, {{.*}}, i32 385, <3 x float> , {{.*}}, <2 x i32> ) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane -; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32({{.*}}, i32 2, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call i32 @llvm.amdgcn.readfirstlane +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.1d.v4f32.f32({{.*}}, i32 2, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.2d.v4f32.f32({{.*}}, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.2d.v4f32.f32({{.*}}, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load 
<8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.3d.v4f32.f32({{.*}}, i32 131586, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.3d.v4f32.f32({{.*}}, i32 131586, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0x3FB99999A0000000, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} -; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.1darray.v4f32.f32({{.*}}, i32 2, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}}) ; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.1darray.v4f32.f32({{.*}}, i32 2, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}}) ; SHADERTEST: load <8 x i32>, ptr addrspace(4) %{{[0-9]*}} +; SHADERTEST: load <4 x i32>, ptr addrspace(4) %{{[0-9]*}} ; SHADERTEST: call {{.*}} <4 x float> @llvm.amdgcn.image.sample.cl.o.2darray.v4f32.f32({{.*}}, i32 514, float 0x3FB99999A0000000, float 0x3FB99999A0000000, float 0.000000e+00, {{.*}}) ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/core/OpLogicalNotEqual_TestGeneral_lit.frag b/llpc/test/shaderdb/core/OpLogicalNotEqual_TestGeneral_lit.frag index 8868c4c22e..8f5321288a 100644 --- a/llpc/test/shaderdb/core/OpLogicalNotEqual_TestGeneral_lit.frag +++ b/llpc/test/shaderdb/core/OpLogicalNotEqual_TestGeneral_lit.frag @@ -30,10 +30,10 @@ void main() // SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) // SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // SHADERTEST-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[TMP0]], align 4 -// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 8 +// SHADERTEST-NEXT: 
[[TMP3:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(7) [[TMP0]], i32 8 // SHADERTEST-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[TMP3]], align 4 // SHADERTEST-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP2]], [[TMP4]] -// SHADERTEST-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 12 +// SHADERTEST-NEXT: [[TMP6:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(7) [[TMP0]], i32 12 // SHADERTEST-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(7) [[TMP6]], align 4 // SHADERTEST-NEXT: [[TMP8:%.*]] = icmp ne i32 [[TMP2]], [[TMP7]] // SHADERTEST-NEXT: [[TMP9:%.*]] = and i1 [[TMP8]], [[TMP5]] diff --git a/llpc/test/shaderdb/core/OpSLessThanEqual_TestSignedAndUnsigned_lit.frag b/llpc/test/shaderdb/core/OpSLessThanEqual_TestSignedAndUnsigned_lit.frag index 7956d7e89c..21dfa067ef 100644 --- a/llpc/test/shaderdb/core/OpSLessThanEqual_TestSignedAndUnsigned_lit.frag +++ b/llpc/test/shaderdb/core/OpSLessThanEqual_TestSignedAndUnsigned_lit.frag @@ -25,7 +25,7 @@ void main() // SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // SHADERTEST-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(7) [[TMP0]], align 8 -// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ [[]2 x i32], [[]2 x i32], [[]2 x i32], [[]2 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} +// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ [[]2 x i32], [[]2 x i32], [[]2 x i32], [[]2 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} // SHADERTEST-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(7) [[TMP3]], align 8 // SHADERTEST-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i64 0 // SHADERTEST-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i64 0 diff --git a/llpc/test/shaderdb/core/OpSLessThan_TestSignedAndUnsigned_lit.frag 
b/llpc/test/shaderdb/core/OpSLessThan_TestSignedAndUnsigned_lit.frag index 2bb60ce1df..357603eba2 100644 --- a/llpc/test/shaderdb/core/OpSLessThan_TestSignedAndUnsigned_lit.frag +++ b/llpc/test/shaderdb/core/OpSLessThan_TestSignedAndUnsigned_lit.frag @@ -25,7 +25,7 @@ void main() // SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // SHADERTEST-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(7) [[TMP0]], align 8 -// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ [[]2 x i32], [[]2 x i32], [[]2 x i32], [[]2 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} +// SHADERTEST-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ [[]2 x i32], [[]2 x i32], [[]2 x i32], [[]2 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} // SHADERTEST-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr addrspace(7) [[TMP3]], align 8 // SHADERTEST-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP2]], i64 0 // SHADERTEST-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP4]], i64 0 diff --git a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert index 25b03a1630..c0a28c5490 100644 --- a/llpc/test/shaderdb/core/TestXfbStateMetadata.vert +++ b/llpc/test/shaderdb/core/TestXfbStateMetadata.vert @@ -29,8 +29,8 @@ void main() // //. // CHECK: attributes #[[ATTR0]] = { nounwind "denormal-fp-math-f32"="preserve-sign" } -// CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) } -// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind } +// CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } +// CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind willreturn memory(read) } //. 
// CHECK: [[META1]] = !{i32 1} // CHECK: [[META6]] = !{i32 0} diff --git a/llpc/test/shaderdb/extensions/ExtMultiView_TestSubpassLoad_lit.pipe b/llpc/test/shaderdb/extensions/ExtMultiView_TestSubpassLoad_lit.pipe index 660dfcb827..54bf3513dc 100644 --- a/llpc/test/shaderdb/extensions/ExtMultiView_TestSubpassLoad_lit.pipe +++ b/llpc/test/shaderdb/extensions/ExtMultiView_TestSubpassLoad_lit.pipe @@ -3,7 +3,7 @@ ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 608, <8 x i32> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 608, ptr addrspace(4) ; SHADERTEST: AMDLLPC SUCCESS ; END_SHADERTEST diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestFmaDouble_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestFmaDouble_lit.frag index 07064eab48..ad76b1641c 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestFmaDouble_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestFmaDouble_lit.frag @@ -23,16 +23,16 @@ void main() // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = load double, ptr addrspace(7) [[TMP0]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} // 
CHECK-NEXT: [[TMP4:%.*]] = load double, ptr addrspace(7) [[TMP3]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds {{i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 2}} +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr {{inbounds i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 2}} // CHECK-NEXT: [[TMP6:%.*]] = load double, ptr addrspace(7) [[TMP5]], align 8 // CHECK-NEXT: [[TMP7:%.*]] = call reassoc nnan nsz arcp contract double (...) @lgc.create.fma.f64(double [[TMP2]], double [[TMP4]], double [[TMP6]]) -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds {{i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 4}} +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr {{inbounds i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 4}} // CHECK-NEXT: [[TMP9:%.*]] = load <3 x double>, ptr addrspace(7) [[TMP8]], align 32 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds {{i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>}}, ptr addrspace(7) [[TMP0]], i32 {{64|0, i32 6}} +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr {{inbounds i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{64|0, i32 6}} // CHECK-NEXT: [[TMP11:%.*]] = load <3 x double>, ptr addrspace(7) [[TMP10]], align 32 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds {{i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>}}, ptr 
addrspace(7) [[TMP0]], i32 {{96|0, i32 8}} +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr {{inbounds i8|<{ double, double, double, [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x double] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{96|0, i32 8}} // CHECK-NEXT: [[TMP13:%.*]] = load <3 x double>, ptr addrspace(7) [[TMP12]], align 32 // CHECK-NEXT: [[TMP14:%.*]] = call reassoc nnan nsz arcp contract <3 x double> (...) @lgc.create.fma.v3f64(<3 x double> [[TMP9]], <3 x double> [[TMP11]], <3 x double> [[TMP13]]) // CHECK-NEXT: [[D3_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x double> [[TMP14]], i64 0 diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestFmaFloat_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestFmaFloat_lit.frag index 015a965534..653bdff986 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestFmaFloat_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestFmaFloat_lit.frag @@ -23,16 +23,16 @@ void main() // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(7) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 1}} +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 1}} // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(7) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds {{i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 2}} +// CHECK-NEXT: [[TMP5:%.*]] 
= getelementptr {{inbounds i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 2}} // CHECK-NEXT: [[TMP6:%.*]] = load float, ptr addrspace(7) [[TMP5]], align 4 // CHECK-NEXT: [[TMP7:%.*]] = call reassoc nnan nsz arcp contract afn float (...) @lgc.create.fma.f32(float [[TMP2]], float [[TMP4]], float [[TMP6]]) -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds {{i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 4}} +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr {{inbounds i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 4}} // CHECK-NEXT: [[TMP9:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP8]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds {{i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 6}} +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr {{inbounds i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 6}} // CHECK-NEXT: [[TMP11:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP10]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds {{i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>}}, ptr addrspace(7) [[TMP0]], i32 {{48|0, i32 8}} +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr {{inbounds i8|<{ float, float, float, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{48|0, i32 8}} // CHECK-NEXT: [[TMP13:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP12]], align 16 // CHECK-NEXT: 
[[TMP14:%.*]] = call reassoc nnan nsz arcp contract afn <3 x float> (...) @lgc.create.fma.v3f32(<3 x float> [[TMP9]], <3 x float> [[TMP11]], <3 x float> [[TMP13]]) // CHECK-NEXT: [[F3_0_0_VEC_EXTRACT:%.*]] = extractelement <3 x float> [[TMP14]], i64 0 diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectDouble_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectDouble_lit.frag index 4cc9302d98..3674cbd309 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectDouble_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectDouble_lit.frag @@ -28,17 +28,17 @@ void main() // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = load double, ptr addrspace(7) [[TMP0]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 1}} // CHECK-NEXT: [[TMP4:%.*]] = load double, ptr addrspace(7) [[TMP3]], align 8 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds {{i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 2}} +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr {{inbounds i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 2}} // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(7) [[TMP5]], align 4 // CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP6]], 0 // CHECK-NEXT: 
[[TMP7:%.*]] = select reassoc nnan nsz arcp contract i1 [[DOTNOT]], double [[TMP2]], double [[TMP4]] -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds {{i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 4}} +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr {{inbounds i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 4}} // CHECK-NEXT: [[TMP9:%.*]] = load <3 x double>, ptr addrspace(7) [[TMP8]], align 32 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds {{i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{64|0, i32 6}} +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr {{inbounds i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{64|0, i32 6}} // CHECK-NEXT: [[TMP11:%.*]] = load <3 x double>, ptr addrspace(7) [[TMP10]], align 32 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds {{i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{96|0, i32 8}} +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr {{inbounds i8|<{ double, double, i32, [[]12 x i8], [[]3 x double], [[]8 x i8], [[]3 x double], [[]8 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{96|0, i32 8}} // CHECK-NEXT: [[TMP13:%.*]] = load <3 x i32>, ptr addrspace(7) [[TMP12]], align 16 // CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x i32> [[TMP13]], i64 1 // CHECK-NEXT: [[DOTNOT2:%.*]] = icmp eq i32 [[TMP14]], 0 diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectFloat_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectFloat_lit.frag index 6889ff741e..f003075a6d 100644 --- 
a/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectFloat_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectFloat_lit.frag @@ -28,17 +28,17 @@ void main() // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(7) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 1}} +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 1}} // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(7) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds {{i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 2}} +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr {{inbounds i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 2}} // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(7) [[TMP5]], align 4 // CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP6]], 0 // CHECK-NEXT: [[TMP7:%.*]] = select reassoc nnan nsz arcp contract afn i1 [[DOTNOT]], float [[TMP2]], float [[TMP4]] -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds {{i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 4}} +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr {{inbounds i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x 
i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 4}} // CHECK-NEXT: [[TMP9:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP8]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds {{i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 6}} +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr {{inbounds i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 6}} // CHECK-NEXT: [[TMP11:%.*]] = load <3 x float>, ptr addrspace(7) [[TMP10]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds {{i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{48|0, i32 8}} +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr {{inbounds i8|<{ float, float, i32, [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x float], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{48|0, i32 8}} // CHECK-NEXT: [[TMP13:%.*]] = load <3 x i32>, ptr addrspace(7) [[TMP12]], align 16 // CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x i32> [[TMP13]], i64 1 // CHECK-NEXT: [[DOTNOT2:%.*]] = icmp eq i32 [[TMP14]], 0 diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectInt_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectInt_lit.frag index 8108e0fbf5..03a4d7b21e 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectInt_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectInt_lit.frag @@ -28,17 +28,17 @@ void main() // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds 
{{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 1}} +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 1}} // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 2}} +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 2}} // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(7) [[TMP5]], align 4 // CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP6]], 0 // CHECK-NEXT: [[TMP7:%.*]] = select i1 [[DOTNOT]], i32 [[TMP2]], i32 [[TMP4]] -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 4}} +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 4}} // CHECK-NEXT: [[TMP9:%.*]] = load <3 x i32>, ptr addrspace(7) [[TMP8]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 6}} +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 6}} // CHECK-NEXT: [[TMP11:%.*]] = load <3 x 
i32>, ptr addrspace(7) [[TMP10]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{48|0, i32 8}} +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{48|0, i32 8}} // CHECK-NEXT: [[TMP13:%.*]] = load <3 x i32>, ptr addrspace(7) [[TMP12]], align 16 // CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x i32> [[TMP13]], i64 1 // CHECK-NEXT: [[DOTNOT2:%.*]] = icmp eq i32 [[TMP14]], 0 diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectUint_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectUint_lit.frag index 461ebc24c6..fb95138864 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectUint_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestMixSelectUint_lit.frag @@ -28,17 +28,17 @@ void main() // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 0) // CHECK-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(7) [[TMP0]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 1}} +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{4|0, i32 1}} // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(7) [[TMP3]], align 4 -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 2}} 
+// CHECK-NEXT: [[TMP5:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{8|0, i32 2}} // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(7) [[TMP5]], align 4 // CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP6]], 0 // CHECK-NEXT: [[TMP7:%.*]] = select i1 [[DOTNOT]], i32 [[TMP2]], i32 [[TMP4]] -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 4}} +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{16|0, i32 4}} // CHECK-NEXT: [[TMP9:%.*]] = load <3 x i32>, ptr addrspace(7) [[TMP8]], align 16 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 6}} +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{32|0, i32 6}} // CHECK-NEXT: [[TMP11:%.*]] = load <3 x i32>, ptr addrspace(7) [[TMP10]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds {{i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>}}, ptr addrspace(7) [[TMP0]], i32 {{48|0, i32 8}} +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr {{inbounds i8|<{ i32, i32, i32, [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32], [[]4 x i8], [[]3 x i32] }>|i8}}, ptr addrspace(7) [[TMP0]], i32 {{48|0, i32 8}} // CHECK-NEXT: [[TMP13:%.*]] = load <3 x i32>, ptr addrspace(7) [[TMP12]], align 16 // CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x i32> [[TMP13]], i64 1 // CHECK-NEXT: 
[[DOTNOT2:%.*]] = icmp eq i32 [[TMP14]], 0 diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestTanhFloat_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestTanhFloat_lit.frag index 5968ac8f47..f81e8478f0 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestTanhFloat_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestTanhFloat_lit.frag @@ -14,13 +14,13 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: = call reassoc nnan nsz arcp contract afn float (...) @lgc.create.tanh.f32(float ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: %{{[0-9]*}} = fmul reassoc nnan nsz arcp contract afn float %{{.*}}, 0x3FF7154760000000 -; SHADERTEST: %{{[0-9]*}} = {{fsub|fneg}} reassoc nnan nsz arcp contract afn float {{(-0.000000e+00, )?}}%{{.*}} +; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %{{[0-9]*}}) +; SHADERTEST: %{{[0-9]*}} = fmul reassoc nnan nsz arcp contract afn float %{{.*}}, 0xC007154760000000 ; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @llvm.exp2.f32(float %{{.*}}) -; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @llvm.exp2.f32(float %{{.*}}) -; SHADERTEST: %{{[0-9]*}} = fsub reassoc nnan nsz arcp contract afn float %{{.*}}, %{{.*}} -; SHADERTEST: %{{[0-9]*}} = fadd reassoc nnan nsz arcp contract afn float %{{.*}}, %{{.*}} -; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.fdiv.fast(float %{{.*}}, float %{{.*}}) +; SHADERTEST: %{{[0-9]*}} = fadd reassoc nnan nsz arcp contract afn float %{{.*}}, 1.000000e+00 +; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %{{.*}}) +; SHADERTEST: %{{[0-9]*}} = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, %{{.*}} +; SHADERTEST: %{{[0-9]*}} = call reassoc nnan nsz arcp contract afn float @llvm.copysign.f32(float %{{.*}}, 
float %{{.*}}) ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/extensions/OpExtInst_TestTanh_lit.frag b/llpc/test/shaderdb/extensions/OpExtInst_TestTanh_lit.frag index 79ee280562..4ab214d410 100644 --- a/llpc/test/shaderdb/extensions/OpExtInst_TestTanh_lit.frag +++ b/llpc/test/shaderdb/extensions/OpExtInst_TestTanh_lit.frag @@ -23,20 +23,20 @@ void main() ; SHADERTEST: = call reassoc nnan nsz arcp contract afn float (...) @lgc.create.tanh.f32(float ; SHADERTEST: = call reassoc nnan nsz arcp contract afn <3 x float> (...) @lgc.create.tanh.v3f32(<3 x float> ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results -; SHADERTEST: %{{.*}} = fmul reassoc nnan nsz arcp contract afn float %{{.*}}, 0x3FF7154760000000 -; SHADERTEST: %{{.*}} = {{fsub|fneg}} reassoc nnan nsz arcp contract afn float {{(-0.000000e+00, )?}}%{{.*}} +; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %{{[0-9]*}}) +; SHADERTEST: %{{.*}} = fmul reassoc nnan nsz arcp contract afn float %{{.*}}, 0xC007154760000000 ; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.exp2.f32(float %{{.*}}) +; SHADERTEST: %{{.*}} = fadd reassoc nnan nsz arcp contract afn float %{{.*}}, 1.000000e+00 +; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %{{.*}}) +; SHADERTEST: %{{.*}} = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, %{{.*}} +; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.copysign.f32(float %{{.*}}, float %{{.*}}) +; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.fabs.f32(float %{{[0-9]*}}) +; SHADERTEST: %{{.*}} = fmul reassoc nnan nsz arcp contract afn float %{{.*}}, 0xC007154760000000 ; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.exp2.f32(float %{{.*}}) -; SHADERTEST: %{{.*}} = fsub reassoc nnan nsz arcp contract afn float %{{.*}}, %{{.*}} -; 
SHADERTEST: %{{.*}} = fadd reassoc nnan nsz arcp contract afn float %{{.*}}, %{{.*}} -; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.fdiv.fast(float %{{.*}}, float %{{.*}}) -; SHADERTEST: %{{.*}} = fmul reassoc nnan nsz arcp contract afn float %{{.*}}, 0x3FF7154760000000 -; SHADERTEST: %{{.*}} = {{fsub|fneg}} reassoc nnan nsz arcp contract afn float {{(-0.000000e+00, )?}}%{{.*}} -; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.exp2.f32(float %{{.*}}) -; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.exp2.f32(float %{{.*}}) -; SHADERTEST: %{{.*}} = fsub reassoc nnan nsz arcp contract afn float %{{.*}}, %{{.*}} -; SHADERTEST: %{{.*}} = fadd reassoc nnan nsz arcp contract afn float %{{.*}}, %{{.*}} -; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.fdiv.fast(float %{{.*}}, float %{{.*}}) +; SHADERTEST: %{{.*}} = fadd reassoc nnan nsz arcp contract afn float %{{.*}}, 1.000000e+00 +; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %{{.*}}) +; SHADERTEST: %{{.*}} = fsub reassoc nnan nsz arcp contract afn float 1.000000e+00, %{{.*}} +; SHADERTEST: %{{.*}} = call reassoc nnan nsz arcp contract afn float @llvm.copysign.f32(float %{{.*}}, float %{{.*}}) ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/general/ImgDescLoad.comp b/llpc/test/shaderdb/general/ImgDescLoad.comp index ccc4431815..3abbe69a02 100644 --- a/llpc/test/shaderdb/general/ImgDescLoad.comp +++ b/llpc/test/shaderdb/general/ImgDescLoad.comp @@ -4,11 +4,22 @@ /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s -; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: [[SMP_DESC:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 16, !invariant.load -; SHADERTEST: [[IMG_DESC:%[0-9]*]] = load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 32, 
!invariant.load -; SHADERTEST: lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> [[IMG_DESC]], <4 x i32> [[SMP_DESC]], i32 33, <2 x float> zeroinitializer, float 0.000000e+00) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline before-patching results +; SHADERTEST: [[IMG_DESC:%[0-9]*]] = load <8 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 32, !invariant.load +; SHADERTEST: [[SMP_DESC:%[0-9]*]] = load <4 x i32>, ptr addrspace(4) %{{[0-9]*}}, align 16, !invariant.load +; SHADERTEST: %{{[0-9]*}} = extractelement <4 x i32> [[SMP_DESC]], i64 0 +; SHADERTEST: %{{[0-9]*}} = call i32 @llvm.amdgcn.readfirstlane(i32 %{{[0-9]*}}) +; SHADERTEST: %{{[0-9]*}} = insertelement <4 x i32> poison, i32 %{{[0-9]*}}, i64 0 +; SHADERTEST: %{{[0-9]*}} = extractelement <4 x i32> [[SMP_DESC]], i64 1 +; SHADERTEST: %{{[0-9]*}} = call i32 @llvm.amdgcn.readfirstlane(i32 %{{[0-9]*}}) +; SHADERTEST: %{{[0-9]*}} = insertelement <4 x i32> %{{[0-9]*}}, i32 %{{[0-9]*}}, i64 1 +; SHADERTEST: %{{[0-9]*}} = extractelement <4 x i32> [[SMP_DESC]], i64 2 +; SHADERTEST: %{{[0-9]*}} = call i32 @llvm.amdgcn.readfirstlane(i32 %{{[0-9]*}}) +; SHADERTEST: %{{[0-9]*}} = insertelement <4 x i32> %{{[0-9]*}}, i32 %{{[0-9]*}}, i64 2 +; SHADERTEST: %{{[0-9]*}} = extractelement <4 x i32> [[SMP_DESC]], i64 3 +; SHADERTEST: %{{[0-9]*}} = call i32 @llvm.amdgcn.readfirstlane(i32 %{{[0-9]*}}) +; SHADERTEST: [[NEW_SMP_DESC:%[0-9]*]] = insertelement <4 x i32> %{{[0-9]*}}, i32 %{{[0-9]*}}, i64 3 +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.l.2d.v4f32.f32(i32 15, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, <8 x i32> [[IMG_DESC]], <4 x i32> [[NEW_SMP_DESC]], i1 false, i32 0, i32 0) */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/general/PipelineCs_DebugPrintf.pipe b/llpc/test/shaderdb/general/PipelineCs_DebugPrintf.pipe index ee39f82720..d4a15e421b 100644 --- a/llpc/test/shaderdb/general/PipelineCs_DebugPrintf.pipe +++ 
b/llpc/test/shaderdb/general/PipelineCs_DebugPrintf.pipe @@ -24,10 +24,9 @@ userDataNode[0].next[0].set = 0xFFFFFFFF userDataNode[0].next[0].binding = 6 ; CHECK-LABEL: @lgc.shader.CS.main( ; CHECK-NEXT: .entry: -; CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 4294967295, i32 6, i32 0, i32 2) -; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i32> (...) @lgc.create.read.builtin.input.v3i32(i32 28, i32 0, i32 poison, i32 poison) -; CHECK-NEXT: [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT:%.*]] = extractelement <3 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: call void (...) @lgc.debug.printf(ptr addrspace(7) [[TMP0]], ptr addrspace(4) @str, i32 [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT]]) -; CHECK-NEXT: call void (...) @lgc.debug.printf(ptr addrspace(7) [[TMP0]], ptr addrspace(4) @str.1, double 1.000000e+00, double 1.000000e+00) +; CHECK-NEXT: [[TMP0:%.*]] = call <3 x i32> (...) @lgc.create.read.builtin.input.v3i32(i32 28, i32 0, i32 poison, i32 poison) +; CHECK-NEXT: [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT:%.*]] = extractelement <3 x i32> [[TMP0]], i64 0 +; CHECK-NEXT: call void (...) @lgc.debug.printf(ptr nonnull @[[GLOB0:[0-9]+]], i32 [[__LLPC_INPUT_PROXY_GL_GLOBALINVOCATIONID_0_VEC_EXTRACT]]) +; CHECK-NEXT: call void (...) @lgc.debug.printf(ptr nonnull @[[GLOB1:[0-9]+]], double 1.000000e+00, double 1.000000e+00) ; CHECK-NEXT: ret void ; diff --git a/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe b/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe index 7ed6a30f8d..d432d16530 100644 --- a/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe +++ b/llpc/test/shaderdb/general/PipelineRays_TestLgcRtTraceRayOp.pipe @@ -1,6 +1,7 @@ ; Check lgc.rt.trace.ray dialect is being generated. 
; BEGIN_SHADERTEST +; REQUIRES: gpurt ; RUN: amdllpc --print-after=llpc-spirv-lower-translator -gfxip 10.3 -o /dev/null 2>&1 %s | FileCheck -check-prefixes=SHADERTEST %s ; SHADERTEST-LABEL: @main( ; SHADERTEST: call void (...) @lgc.rt.trace.ray(i64 %{{[0-9]+}}, i32 0, i32 %{{[0-9]+}}, i32 0, i32 0, i32 0, <3 x float> %{{[0-9]+}}, float %{{[0-9]+}}, <3 x float> %{{[0-9]+}}, float %{{[0-9]+}}, ptr addrspace(5) @RayPayloadKHR0, [1 x i32] [i32 16]) diff --git a/llpc/test/shaderdb/general/WorkaroundStorageImageFormats.pipe b/llpc/test/shaderdb/general/WorkaroundStorageImageFormats.pipe index 0ee38a19af..2854fdab17 100644 --- a/llpc/test/shaderdb/general/WorkaroundStorageImageFormats.pipe +++ b/llpc/test/shaderdb/general/WorkaroundStorageImageFormats.pipe @@ -28,7 +28,6 @@ userDataNode[0].next[0].binding = 0 ; CHECK-NEXT: .entry: ; CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) (...) @lgc.create.get.desc.ptr.p4(i32 1, i32 1, i64 0, i32 0) ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @lgc.create.get.desc.stride.i32(i32 1, i32 1, i64 0, i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP0]], align 32 -; CHECK-NEXT: call void (...) @lgc.create.image.store(<4 x float> , i32 1, i32 512, <8 x i32> [[TMP2]], <2 x i32> ) +; CHECK-NEXT: call void (...) 
@lgc.create.image.store(<4 x float> , i32 1, i32 512, ptr addrspace(4) [[TMP0]], <2 x i32> ) ; CHECK-NEXT: ret void ; diff --git a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe index 074a293e19..4e89bd1ffd 100644 --- a/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe +++ b/llpc/test/shaderdb/gfx10/PipelineVsFs_TestVsOutMiscSideBusEna.pipe @@ -64,7 +64,6 @@ entryPoint = main ; SHADERTEST-NEXT: .pa_cl_clip_cntl: ; SHADERTEST-NEXT: .dx_linear_attr_clip_ena: true ; SHADERTEST-NEXT: .rasterization_kill: false -; SHADERTEST-NEXT: .vs_out_misc_side_bus_ena: true ; SHADERTEST-NEXT: .vte_vport_provoke_disable: false ; SHADERTEST-NEXT: .pa_cl_vs_out_cntl: ; SHADERTEST-NEXT: .clip_dist_ena_0: true @@ -84,6 +83,7 @@ entryPoint = main ; SHADERTEST-NEXT: .cull_dist_ena_6: false ; SHADERTEST-NEXT: .cull_dist_ena_7: false ; SHADERTEST-NEXT: .vs_out_cc_dist0_vec_ena: true +; SHADERTEST-NEXT: .vs_out_misc_side_bus_ena: true ; SHADERTEST-NEXT: .pa_cl_vte_cntl: ; SHADERTEST-NEXT: .vtx_w0_fmt: true ; SHADERTEST-NEXT: .x_offset_ena: true @@ -193,7 +193,8 @@ entryPoint = main ; SHADERTEST-NEXT: .entry_point: _amdgpu_ps_main ; SHADERTEST-NEXT: .float_mode: 0xc0 ; SHADERTEST-NEXT: .ieee_mode: false -; SHADERTEST: .mem_ordered: true +; SHADERTEST-NEXT: .lds_size: 0 +; SHADERTEST-NEXT: .mem_ordered: true ; SHADERTEST-NEXT: .scratch_en: false ; SHADERTEST-NEXT: .scratch_memory_size: 0 ; SHADERTEST-NEXT: .sgpr_count: 0x2 @@ -246,7 +247,8 @@ entryPoint = main ; SHADERTEST-NEXT: .entry_point: _amdgpu_vs_main ; SHADERTEST-NEXT: .float_mode: 0xc0 ; SHADERTEST-NEXT: .ieee_mode: false -; SHADERTEST: .mem_ordered: true +; SHADERTEST-NEXT: .lds_size: 0 +; SHADERTEST-NEXT: .mem_ordered: true ; SHADERTEST-NEXT: .scratch_en: false ; SHADERTEST-NEXT: .scratch_memory_size: 0 ; SHADERTEST-NEXT: .sgpr_count: 0x3 diff --git a/llpc/test/shaderdb/gfx11/AttributePrecedesPos.pipe 
b/llpc/test/shaderdb/gfx11/AttributePrecedesPos.pipe index 7aa05e7a74..243b999907 100644 --- a/llpc/test/shaderdb/gfx11/AttributePrecedesPos.pipe +++ b/llpc/test/shaderdb/gfx11/AttributePrecedesPos.pipe @@ -3,7 +3,7 @@ ; RUN: amdllpc %gfxip %s -v | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: @_amdgpu_gs_main( ; SHADERTEST: call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 0, i32 %{{.*}}, i32 3) -; SHADERTEST: fence release +; SHADERTEST: fence syncscope("agent") release ; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}, i1 false, i1 false) ; SHADERTEST: call void @llvm.amdgcn.exp.f32(i32 13, i32 1, float 1.000000e+00, float poison, float poison, float poison, i1 true, i1 false) ; SHADERTEST-LABEL: _amdgpu_gs_main: diff --git a/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp b/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp index 4ca900bf0d..6aad9398cd 100644 --- a/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp +++ b/llpc/test/shaderdb/gfx11/cooperativeMatrix/array-of-matrices.comp @@ -35,7 +35,7 @@ void main() { // CHECK-NEXT: [[TMP0:%.*]] = call ptr addrspace(4) (...) @lgc.create.load.push.constants.ptr.p4() // CHECK-NEXT: [[TMP1:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 0, i32 0, i32 0, i32 2) // CHECK-NEXT: [[LOAD:%.*]] = call <8 x float> (...) @lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP1]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP1]], i32 512 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(7) [[TMP1]], i32 512 // CHECK-NEXT: [[LOAD2:%.*]] = call <8 x float> (...) 
@lgc.cooperative.matrix.load__v8f32(ptr addrspace(7) [[TMP2]], i32 32, i1 true, i32 1, i32 0, i32 0, i32 16) // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TMP0]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP3]], 2 @@ -49,13 +49,13 @@ void main() { // CHECK: 10: // CHECK-NEXT: [[TMP11:%.*]] = phi i32 [ 0, [[DOTENTRY:%.*]] ], [ [[TMP9]], [[TMP5]] ] // CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(7) [[TMP1]], align 4 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[TMP0]], i64 4 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(4) [[TMP0]], i64 4 // CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(4) [[TMP12]], align 4 // CHECK-NEXT: [[TMP14:%.*]] = icmp ult i32 [[TMP13]], 2 // CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 1 // CHECK-NEXT: [[TMP16:%.*]] = select i1 [[TMP15]], <8 x float> [[LOAD2]], <8 x float> [[LOAD]] // CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP14]], <8 x float> [[TMP16]], <8 x float> zeroinitializer -// CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP1]], i32 1024 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(7) [[TMP1]], i32 1024 // CHECK-NEXT: call void (...) 
@lgc.cooperative.matrix.store(ptr addrspace(7) [[TMP18]], i32 64, i1 true, i32 1, i32 0, i32 0, i32 16, <8 x float> [[TMP17]]) // CHECK-NEXT: ret void // diff --git a/llpc/test/shaderdb/graphics_library/PipelineVsFs_TestGraphicsLibrary.pipe b/llpc/test/shaderdb/graphics_library/PipelineVsFs_TestGraphicsLibrary.pipe index 685133b765..584652edee 100644 --- a/llpc/test/shaderdb/graphics_library/PipelineVsFs_TestGraphicsLibrary.pipe +++ b/llpc/test/shaderdb/graphics_library/PipelineVsFs_TestGraphicsLibrary.pipe @@ -20,7 +20,7 @@ colorExport=PipelineLibCes_TestColorExport.pipe ; SHADERTEST-NEXT: [[VERTEXINDEX:%.*]] = add i32 [[TMP3]], [[TMP4]] ; SHADERTEST-NEXT: [[DOTFR:%.*]] = freeze i32 [[VERTEXINDEX]] ; SHADERTEST-NEXT: [[TMP5:%.*]] = icmp slt i32 [[DOTFR]], 3 -; SHADERTEST-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 4 +; SHADERTEST-NEXT: [[TMP6:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(7) [[TMP0]], i32 4 ; SHADERTEST-NEXT: [[DOT0_IN:%.*]] = select i1 [[TMP5]], ptr addrspace(7) [[TMP0]], ptr addrspace(7) [[TMP6]] ; SHADERTEST-NEXT: [[DOT0:%.*]] = load float, ptr addrspace(7) [[DOT0_IN]], align 4 ; SHADERTEST-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP2]], float 1.000000e+00, i64 3 @@ -34,16 +34,16 @@ colorExport=PipelineLibCes_TestColorExport.pipe ; SHADERTEST-NEXT: .entry: ; SHADERTEST-NEXT: [[TMP0:%.*]] = call ptr addrspace(7) @lgc.load.buffer.desc(i64 1, i32 1, i32 0, i32 0) ; SHADERTEST-NEXT: [[TMP1:%.*]] = call ptr @llvm.invariant.start.p7(i64 -1, ptr addrspace(7) [[TMP0]]) -; SHADERTEST-NEXT: [[FRAGCOORD:%.*]] = call <4 x float> @lgc.input.import.builtin.FragCoord.v4f32.i32(i32 15) #[[ATTR3]] +; SHADERTEST-NEXT: [[FRAGCOORD:%.*]] = call <4 x float> @lgc.input.import.builtin.FragCoord.v4f32.i32(i32 15) #[[ATTR4]] ; SHADERTEST-NEXT: [[__LLPC_INPUT_PROXY_GL_FRAGCOORD_4_VEC_EXTRACT:%.*]] = extractelement <4 x float> [[FRAGCOORD]], i64 1 ; SHADERTEST-NEXT: [[TMP2:%.*]] = fadd reassoc nnan nsz arcp contract 
afn float [[__LLPC_INPUT_PROXY_GL_FRAGCOORD_4_VEC_EXTRACT]], -5.000000e-01 ; SHADERTEST-NEXT: [[TMP3:%.*]] = fptosi float [[TMP2]] to i32 ; SHADERTEST-NEXT: [[DOTFR:%.*]] = freeze i32 [[TMP3]] ; SHADERTEST-NEXT: [[TMP4:%.*]] = icmp slt i32 [[DOTFR]], 8 -; SHADERTEST-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(7) [[TMP0]], i32 16 +; SHADERTEST-NEXT: [[TMP5:%.*]] = getelementptr {{(inbounds )?}}i8, ptr addrspace(7) [[TMP0]], i32 16 ; SHADERTEST-NEXT: [[DOT0_IN:%.*]] = select i1 [[TMP4]], ptr addrspace(7) [[TMP0]], ptr addrspace(7) [[TMP5]] ; SHADERTEST-NEXT: [[DOT0:%.*]] = load <4 x float>, ptr addrspace(7) [[DOT0_IN]], align 16 -; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[DOT0]]) #[[ATTR4]] +; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[DOT0]]) #[[ATTR3]] ; SHADERTEST-NEXT: ret void ; ; diff --git a/llpc/test/shaderdb/object/ObjFragMask_TestFragFetch_lit.frag b/llpc/test/shaderdb/object/ObjFragMask_TestFragFetch_lit.frag index 0eeac24367..dda5e3a2bf 100644 --- a/llpc/test/shaderdb/object/ObjFragMask_TestFragFetch_lit.frag +++ b/llpc/test/shaderdb/object/ObjFragMask_TestFragFetch_lit.frag @@ -32,11 +32,11 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 6, i32 512, <8 x i32> -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 2, i32 512, <8 x i32> -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 7, i32 512, <8 x i32> -; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 1, i32 544, <8 x i32> -; SHADERTEST: call <4 x i32> (...) 
@lgc.create.image.load.v4i32(i32 6, i32 544, <8 x i32> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 6, i32 512, ptr addrspace(4) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 2, i32 512, ptr addrspace(4) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 7, i32 512, ptr addrspace(4) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 1, i32 544, ptr addrspace(4) +; SHADERTEST: call <4 x i32> (...) @lgc.create.image.load.v4i32(i32 6, i32 544, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i16 ; SHADERTEST: call i32 @llvm.amdgcn.image.load.3d.i32.i16(i32 1, i16 2, i16 3, i16 1, <8 x i32> %{{.*}}, i32 0, i32 0) diff --git a/llpc/test/shaderdb/object/ObjImage_TestCubeAtomicAdd_lit.comp b/llpc/test/shaderdb/object/ObjImage_TestCubeAtomicAdd_lit.comp index 17520b8f99..365fc5f536 100644 --- a/llpc/test/shaderdb/object/ObjImage_TestCubeAtomicAdd_lit.comp +++ b/llpc/test/shaderdb/object/ObjImage_TestCubeAtomicAdd_lit.comp @@ -16,10 +16,10 @@ void main (void) /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 8, i32 0, i32 0, <8 x i32> +; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 8, i32 0, i32 0, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC.*}} SPIR-V lowering results -; SHADERTEST: call i32 (...) @lgc.create.image.atomic.i32(i32 2, i32 8, i32 0, i32 0, <8 x i32> +; SHADERTEST: call i32 (...) 
@lgc.create.image.atomic.i32(i32 2, i32 8, i32 0, i32 0, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC}} final pipeline module info ; SHADERTEST: call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32 diff --git a/llpc/test/shaderdb/object/ObjImage_TestMemoryQualifier_lit.frag b/llpc/test/shaderdb/object/ObjImage_TestMemoryQualifier_lit.frag index d8be3affa1..921561a6de 100644 --- a/llpc/test/shaderdb/object/ObjImage_TestMemoryQualifier_lit.frag +++ b/llpc/test/shaderdb/object/ObjImage_TestMemoryQualifier_lit.frag @@ -16,13 +16,13 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 513, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, <8 x i32> -; SHADERTEST: call void (...) @lgc.create.image.store(<4 x float> %{{[^,]*}}, i32 9, i32 515, <8 x i32> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 513, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, ptr addrspace(4) +; SHADERTEST: call void (...) @lgc.create.image.store(<4 x float> %{{[^,]*}}, i32 9, i32 515, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC.*}} SPIR-V lowering results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 0, i32 513, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, <8 x i32> -; SHADERTEST: call void (...) @lgc.create.image.store(<4 x float> %{{[^,]*}}, i32 9, i32 515, <8 x i32> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.load.v4f32(i32 0, i32 513, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, ptr addrspace(4) +; SHADERTEST: call void (...) @lgc.create.image.store(<4 x float> %{{[^,]*}}, i32 9, i32 515, ptr addrspace(4) ; SHADERTEST: AMDLLPC SUCCESS */ // END_SHADERTEST diff --git a/llpc/test/shaderdb/object/ObjResource_TestAlias_lit.frag b/llpc/test/shaderdb/object/ObjResource_TestAlias_lit.frag index c2d76316b4..37f246212a 100644 --- a/llpc/test/shaderdb/object/ObjResource_TestAlias_lit.frag +++ b/llpc/test/shaderdb/object/ObjResource_TestAlias_lit.frag @@ -39,16 +39,15 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, <8 x i32> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.load.v4f32(i32 1, i32 512, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC.*}} SPIR-V lowering results ; SHADERTEST: call {{.*}} {{.*}}@lgc.load.buffer.desc{{.*}}(i64 0, i32 1,{{.*}} -; SHADERTEST: load <4 x float> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, <8 x i32> +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.load.v4f32(i32 1, i32 512, ptr addrspace(4) ; SHADERTEST: AMDLLPC SUCCESS diff --git a/llpc/test/shaderdb/object/ObjSampler_TestLargeId_lit.frag b/llpc/test/shaderdb/object/ObjSampler_TestLargeId_lit.frag index 39a9031568..d04cd7195d 100644 --- a/llpc/test/shaderdb/object/ObjSampler_TestLargeId_lit.frag +++ b/llpc/test/shaderdb/object/ObjSampler_TestLargeId_lit.frag @@ -46,23 +46,23 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC.*}} SPIR-V lowering results -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) -; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, <8 x i32> %{{[-0-9A-Za0z_.]+}}, <4 x i32> %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) 
@lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) +; SHADERTEST: call reassoc nnan nsz arcp contract afn <4 x float> (...) @lgc.create.image.sample.v4f32(i32 1, i32 512, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, ptr addrspace(4) %{{[-0-9A-Za0z_.]+}}, i32 1, <2 x float> zeroinitializer) ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/object/ObjSampler_TestSeparateSamplerShadow_lit.frag b/llpc/test/shaderdb/object/ObjSampler_TestSeparateSamplerShadow_lit.frag index 4636798b25..92b381dded 100644 --- a/llpc/test/shaderdb/object/ObjSampler_TestSeparateSamplerShadow_lit.frag +++ b/llpc/test/shaderdb/object/ObjSampler_TestSeparateSamplerShadow_lit.frag @@ -14,7 +14,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) 
@lgc.create.image.sample.f32(i32 1, i32 512, <8 x i32> +; SHADERTEST: call reassoc nnan nsz arcp contract afn float (...) @lgc.create.image.sample.f32(i32 1, i32 512, ptr addrspace(4) ; SHADERTEST-LABEL: {{^// LLPC}} pipeline patching results ; SHADERTEST: call {{.*}} float @llvm.amdgcn.image.sample.c.lz.2d.f32.f16(i32 1, float 0.000000e+00, half 0xH0000, half 0xH0000, <8 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 false, i32 0, i32 0) diff --git a/llpc/test/shaderdb/object/ObjSharedVariable_TestArrayCopy_lit.comp b/llpc/test/shaderdb/object/ObjSharedVariable_TestArrayCopy_lit.comp index 2fac95aef8..f311930853 100644 --- a/llpc/test/shaderdb/object/ObjSharedVariable_TestArrayCopy_lit.comp +++ b/llpc/test/shaderdb/object/ObjSharedVariable_TestArrayCopy_lit.comp @@ -30,8 +30,8 @@ void main() ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: @[[LDS:[^ ]*]] = addrspace(3) global { i32, [16 x i32] } -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr inbounds ({ i32, [16 x i32] }, ptr addrspace(3) @[[LDS]], i32 0, i32 1, i32 {{[0-9]*}}) -; SHADERTEST: %{{[0-9]*}} = load i32, ptr addrspace(3) getelementptr inbounds ({ i32, [16 x i32] }, ptr addrspace(3) @[[LDS]], i32 0, i32 1, i32 {{[0-9]*}}) +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{(inbounds )?}}({ i32, [16 x i32] }, ptr addrspace(3) @[[LDS]], i32 0, i32 1, i32 {{[0-9]*}}) +; SHADERTEST: %{{[0-9]*}} = load i32, ptr addrspace(3) getelementptr {{(inbounds )?}}({ i32, [16 x i32] }, ptr addrspace(3) @[[LDS]], i32 0, i32 1, i32 {{[0-9]*}}) ; SHADERTEST: %{{[0-9]*}} = getelementptr { i32, [16 x i32] }, ptr addrspace(3) @[[LDS]], i32 0, i32 1, i32 %{{[0-9]*}} ; SHADERTEST: %{{[0-9]*}} = load i32, ptr addrspace(3) %{{[0-9]*}} diff --git a/llpc/test/shaderdb/object/ObjSharedVariable_TestArray_lit.comp b/llpc/test/shaderdb/object/ObjSharedVariable_TestArray_lit.comp index bddcfcbd43..2623ea3276 100644 --- 
a/llpc/test/shaderdb/object/ObjSharedVariable_TestArray_lit.comp +++ b/llpc/test/shaderdb/object/ObjSharedVariable_TestArray_lit.comp @@ -22,8 +22,8 @@ void main() ; SHADERTEST: @[[LDS:[^ ]*]] = addrspace(3) global [16 x i32] poison ; SHADERTEST: %{{[0-9]*}} = getelementptr [16 x i32], ptr addrspace(3) @[[LDS]], i32 0, i32 %{{[0-9]*}} ; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) %{{[0-9]*}} -; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr inbounds ([16 x i32], ptr addrspace(3) @[[LDS]], i32 0, i32 3) -; SHADERTEST: %{{[0-9]*}} = load i32, ptr addrspace(3) getelementptr inbounds ([16 x i32], ptr addrspace(3) @[[LDS]], i32 0, i32 4) +; SHADERTEST: store i32 %{{[0-9]*}}, ptr addrspace(3) getelementptr {{(inbounds )?}}([16 x i32], ptr addrspace(3) @[[LDS]], i32 0, i32 3) +; SHADERTEST: %{{[0-9]*}} = load i32, ptr addrspace(3) getelementptr {{(inbounds )?}}([16 x i32], ptr addrspace(3) @[[LDS]], i32 0, i32 4) ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/object/ObjStorageBlock_TestMultiLevelAccessChain_lit.vert b/llpc/test/shaderdb/object/ObjStorageBlock_TestMultiLevelAccessChain_lit.vert index 35540f4a9a..788af1911b 100644 --- a/llpc/test/shaderdb/object/ObjStorageBlock_TestMultiLevelAccessChain_lit.vert +++ b/llpc/test/shaderdb/object/ObjStorageBlock_TestMultiLevelAccessChain_lit.vert @@ -26,7 +26,7 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results ; SHADERTEST: getelementptr { <4 x float> }, ptr addrspace({{.*}}) %{{[a-z0-9]*}}, i32 0, i32 0 -; SHADERTEST: getelementptr inbounds (<{ [3 x float], [4 x i8], <{ [4 x float] }> }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 2 +; SHADERTEST: getelementptr {{(inbounds )?}}(<{ [3 x float], [4 x i8], <{ [4 x float] }> }>, ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 2 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: store <4 x float> , diff --git 
a/llpc/test/shaderdb/object/ObjUniformBlock_TestDirectIndex_lit.frag b/llpc/test/shaderdb/object/ObjUniformBlock_TestDirectIndex_lit.frag index ba1b71735e..3b21275d95 100644 --- a/llpc/test/shaderdb/object/ObjUniformBlock_TestDirectIndex_lit.frag +++ b/llpc/test/shaderdb/object/ObjUniformBlock_TestDirectIndex_lit.frag @@ -17,7 +17,7 @@ void main() /* ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: getelementptr inbounds ([4 x <{ [4 x float], [10 x [4 x float]] }>], ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 3, i32 1, i32 5 +; SHADERTEST: getelementptr {{(inbounds )?}}([4 x <{ [4 x float], [10 x [4 x float]] }>], ptr addrspace({{.*}}) @{{.*}}, i32 0, i32 3, i32 1, i32 5 ; SHADERTEST-LABEL: {{^// LLPC}} SPIR-V lowering results ; SHADERTEST: call {{.*}} {{.*}}@lgc.load.buffer.desc(i64 1, i32 0, i32 0 diff --git a/llpc/test/shaderdb/object/ObjUniformBlock_TestLoadMatrixArray_lit.vert b/llpc/test/shaderdb/object/ObjUniformBlock_TestLoadMatrixArray_lit.vert index 6aeaafdcdb..cda46c22b6 100644 --- a/llpc/test/shaderdb/object/ObjUniformBlock_TestLoadMatrixArray_lit.vert +++ b/llpc/test/shaderdb/object/ObjUniformBlock_TestLoadMatrixArray_lit.vert @@ -20,14 +20,14 @@ void main() ; RUN: amdllpc -v %gfxip %s | FileCheck -check-prefix=SHADERTEST %s ; SHADERTEST-LABEL: {{^// LLPC}} SPIRV-to-LLVM translation results -; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr inbounds (<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2), align 16 -; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr ([4 x %llpc.matrix.column], ptr addrspace(7) getelementptr inbounds (<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2), i32 0, i32 1, i32 0), align 16 -; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr ([4 x %llpc.matrix.column], 
ptr addrspace(7) getelementptr inbounds (<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2), i32 0, i32 2, i32 0), align 16 -; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr ([4 x %llpc.matrix.column], ptr addrspace(7) getelementptr inbounds (<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2), i32 0, i32 3, i32 0), align 16 -; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr inbounds (<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2, i32 1), align 16 -; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr inbounds (<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2, i32 1, i32 1, i32 0), align 16 -; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr inbounds (<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2, i32 1, i32 2, i32 0), align 16 -; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr inbounds (<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2, i32 1, i32 3, i32 0), align 16 +; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr {{(inbounds )?}}(<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2), align 16 +; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr ([4 x %llpc.matrix.column], ptr addrspace(7) getelementptr {{(inbounds )?}}(<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2), i32 0, i32 1, i32 0), align 16 +; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr ([4 x %llpc.matrix.column], ptr addrspace(7) getelementptr {{(inbounds )?}}(<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, 
i32 0, i32 2), i32 0, i32 2, i32 0), align 16 +; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr ([4 x %llpc.matrix.column], ptr addrspace(7) getelementptr {{(inbounds )?}}(<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2), i32 0, i32 3, i32 0), align 16 +; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr {{(inbounds )?}}(<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2, i32 1), align 16 +; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr {{(inbounds )?}}(<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2, i32 1, i32 1, i32 0), align 16 +; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr {{(inbounds )?}}(<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2, i32 1, i32 2, i32 0), align 16 +; SHADERTEST: %{{[0-9]*}} = load <4 x float>, ptr addrspace(7) getelementptr {{(inbounds )?}}(<{ i32, [12 x i8], [2 x [4 x %{{[a-z.]*}}]] }>, ptr addrspace(7) @{{[a-z0-9]+}}, i32 0, i32 2, i32 1, i32 3, i32 0), align 16 ; SHADERTEST: AMDLLPC SUCCESS */ diff --git a/llpc/test/shaderdb/ray_tracing/lit.local.cfg b/llpc/test/shaderdb/ray_tracing/lit.local.cfg index c839f74489..995b0dc96b 100644 --- a/llpc/test/shaderdb/ray_tracing/lit.local.cfg +++ b/llpc/test/shaderdb/ray_tracing/lit.local.cfg @@ -1,3 +1,5 @@ +if "gpurt" not in config.available_features: + config.unsupported = True # overwrite %gfxip in config.substitutions config.gfxip = '-gfxip=10.3' diff --git a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe index 76845b01dc..b08b250318 100644 --- a/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe +++ b/llpc/test/shaderdb/relocatable_shaders/PipelineVsFs_EnableColorExport.pipe 
@@ -71,79 +71,62 @@ attribute[0].binding = 0 attribute[0].format = VK_FORMAT_R32G32B32A32_SFLOAT attribute[0].offset = 0 -; -; SHADERTEST-LABEL: @_amdgpu_ps_main( +; SHADERTEST-LABEL: @lgc.shader.VS.main( ; SHADERTEST-NEXT: .entry: -; SHADERTEST-NEXT: [[PERSPINTERPCENTER_I1:%.*]] = extractelement <2 x float> [[PERSPINTERPCENTER:%.*]], i64 1 -; SHADERTEST-NEXT: [[PERSPINTERPCENTER_I0:%.*]] = extractelement <2 x float> [[PERSPINTERPCENTER]], i64 0 -; SHADERTEST-NEXT: [[TMP11:%.*]] = call i64 @llvm.amdgcn.s.getpc() -; SHADERTEST-NEXT: [[TMP16:%.*]] = call float @llvm.amdgcn.interp.p1(float [[PERSPINTERPCENTER_I0]], i32 immarg 0, i32 immarg 0, i32 [[PRIMMASK:%.*]]) -; SHADERTEST-NEXT: [[TMP17:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP16]], float [[PERSPINTERPCENTER_I1]], i32 immarg 0, i32 immarg 0, i32 [[PRIMMASK]]) -; SHADERTEST-NEXT: [[TMP18:%.*]] = call float @llvm.amdgcn.interp.p1(float [[PERSPINTERPCENTER_I0]], i32 immarg 1, i32 immarg 0, i32 [[PRIMMASK]]) -; SHADERTEST-NEXT: [[TMP19:%.*]] = call float @llvm.amdgcn.interp.p2(float [[TMP18]], float [[PERSPINTERPCENTER_I1]], i32 immarg 1, i32 immarg 0, i32 [[PRIMMASK]]) -; SHADERTEST-NEXT: [[TMP12:%.*]] = and i64 [[TMP11]], -4294967296 -; SHADERTEST-NEXT: [[TMP13:%.*]] = zext i32 [[DESCTABLE0:%.*]] to i64 -; SHADERTEST-NEXT: [[TMP14:%.*]] = or {{(disjoint )?}}i64 [[TMP12]], [[TMP13]] -; SHADERTEST-NEXT: [[TMP15:%.*]] = inttoptr i64 [[TMP14]] to ptr addrspace(4) -; SHADERTEST-NEXT: [[DOTI0:%.*]] = fptosi float [[TMP17]] to i32 -; SHADERTEST-NEXT: [[DOTI1:%.*]] = fptosi float [[TMP19]] to i32 -; SHADERTEST-NEXT: [[TMP20:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP15]], align 32, !invariant.load !{{.*}} -; SHADERTEST-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP15]], align 16, !invariant.load !{{.*}} -; SHADERTEST-NEXT: [[DOTI01:%.*]] = sitofp i32 [[DOTI0]] to float -; SHADERTEST-NEXT: [[DOTI12:%.*]] = sitofp i32 [[DOTI1]] to float -; SHADERTEST-NEXT: [[TMP23:%.*]] = call reassoc nnan nsz 
arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[DOTI01]], float [[DOTI12]], <8 x i32> [[TMP20]], <4 x i32> [[TMP21]], i1 false, i32 0, i32 0) -; SHADERTEST-NEXT: [[TMP25:%.*]] = zext i32 [[COLOREXPADDR:%.*]] to i64 -; SHADERTEST-NEXT: [[TMP26:%.*]] = or {{(disjoint )?}}i64 [[TMP12]], [[TMP25]] -; SHADERTEST-NEXT: [[TMP27:%.*]] = inttoptr i64 [[TMP26]] to ptr addrspace(4) -; SHADERTEST-NEXT: call amdgpu_gfx addrspace(4) void [[TMP27]](<4 x float> [[TMP23]], i32 inreg 0) #[[ATTR1:[0-9]+]] -; SHADERTEST-NEXT: unreachable +; SHADERTEST-NEXT: [[TMP0:%.*]] = call <2 x float> @lgc.input.import.generic__v2f32(i1 false, i32 0, i32 0, i32 0, i32 poison) +; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v2f32(i32 0, i32 0, <2 x float> [[TMP0]]) #[[ATTR1:[0-9]+]] +; SHADERTEST-NEXT: ret void +; ; ; -; SHADERTEST-LABEL: amdgpu_ps_main: -; SHADERTEST: s_getpc_b64 s[6:7] -; SHADERTEST-NEXT: s_mov_b32 s6, s0 -; SHADERTEST-NEXT: s_mov_b32 s32, 0 -; SHADERTEST-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0 -; SHADERTEST-NEXT: s_waitcnt lgkmcnt(0) -; SHADERTEST-NEXT: s_and_b32 s7, s7, 0xffff -; SHADERTEST-NEXT: s_add_u32 s6, s6, s4 -; SHADERTEST-NEXT: s_addc_u32 s7, s7, 0 -; SHADERTEST-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6 -; SHADERTEST-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7 -; SHADERTEST-NEXT: s_wqm_b64 exec, exec -; SHADERTEST-NEXT: s_getpc_b64 s[16:17] -; SHADERTEST-NEXT: s_mov_b32 s0, s1 -; SHADERTEST-NEXT: s_mov_b32 m0, s3 -; SHADERTEST-NEXT: s_mov_b32 s1, s17 -; SHADERTEST-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x -; SHADERTEST-NEXT: v_interp_p1_f32_e32 v0, v0, attr0.y -; SHADERTEST-NEXT: s_clause 0x1 -; SHADERTEST-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x0 -; SHADERTEST-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x0 -; SHADERTEST-NEXT: s_mov_b32 s3, s17 -; SHADERTEST-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x -; SHADERTEST-NEXT: v_interp_p2_f32_e32 v0, v1, attr0.y -; SHADERTEST-NEXT: v_cvt_i32_f32_e32 v1, v2 -; 
SHADERTEST-NEXT: v_cvt_i32_f32_e32 v2, v0 -; SHADERTEST-NEXT: v_cvt_f32_i32_e32 v0, v1 -; SHADERTEST-NEXT: v_cvt_f32_i32_e32 v1, v2 -; SHADERTEST-NEXT: s_waitcnt lgkmcnt(0) -; SHADERTEST-NEXT: image_sample v[0:3], v[0:1], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D -; SHADERTEST-NEXT: s_mov_b32 s4, 0 -; SHADERTEST-NEXT: s_swappc_b64 s[30:31], s[2:3] +; SHADERTEST-LABEL: @lgc.shader.FS.main( +; SHADERTEST-NEXT: .entry: +; SHADERTEST-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; SHADERTEST-NEXT: [[TMP1:%.*]] = bitcast i64 [[TMP0]] to <2 x i32> +; SHADERTEST-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.s.getpc() +; SHADERTEST-NEXT: [[TMP3:%.*]] = bitcast i64 [[TMP2]] to <2 x i32> +; SHADERTEST-NEXT: [[INTERPPERSPCENTER:%.*]] = call <2 x float> @lgc.input.import.builtin.InterpPerspCenter.v2f32.i32(i32 268435457) #[[ATTR5:[0-9]+]] +; SHADERTEST-NEXT: [[TMP4:%.*]] = call <2 x float> (...) @lgc.input.import.interpolated__v2f32(i1 false, i32 0, i32 0, i32 0, i32 poison, i32 0, <2 x float> [[INTERPPERSPCENTER]]) +; SHADERTEST-NEXT: [[TMP5:%.*]] = call i32 @lgc.load.user.data__i32(i32 44) +; SHADERTEST-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[TMP5]], i64 0 +; SHADERTEST-NEXT: [[TMP7:%.*]] = bitcast <2 x i32> [[TMP6]] to i64 +; SHADERTEST-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr addrspace(4) +; SHADERTEST-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP8]], i32 0 +; SHADERTEST-NEXT: [[TMP10:%.*]] = call i32 @lgc.load.user.data__i32(i32 44) +; SHADERTEST-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[TMP10]], i64 0 +; SHADERTEST-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to i64 +; SHADERTEST-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(4) +; SHADERTEST-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP13]], i32 0 +; SHADERTEST-NEXT: [[TMP15:%.*]] = fptosi <2 x float> [[TMP4]] to <2 x i32> +; SHADERTEST-NEXT: [[TMP16:%.*]] = sitofp <2 x i32> [[TMP15]] to <2 x float> +; 
SHADERTEST-NEXT: [[TMP17:%.*]] = load <8 x i32>, ptr addrspace(4) [[TMP14]], align 32, !invariant.load !11 +; SHADERTEST-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr addrspace(4) [[TMP9]], align 16, !invariant.load !11 +; SHADERTEST-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i64 0 +; SHADERTEST-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i64 1 +; SHADERTEST-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP18]], i64 0 +; SHADERTEST-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) +; SHADERTEST-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> poison, i32 [[TMP22]], i64 0 +; SHADERTEST-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP18]], i64 1 +; SHADERTEST-NEXT: [[TMP25:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP24]]) +; SHADERTEST-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP25]], i64 1 +; SHADERTEST-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP18]], i64 2 +; SHADERTEST-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP27]]) +; SHADERTEST-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> [[TMP26]], i32 [[TMP28]], i64 2 +; SHADERTEST-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP18]], i64 3 +; SHADERTEST-NEXT: [[TMP31:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP30]]) +; SHADERTEST-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP31]], i64 3 +; SHADERTEST-NEXT: [[TMP33:%.*]] = call reassoc nnan nsz arcp contract afn <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float [[TMP19]], float [[TMP20]], <8 x i32> [[TMP17]], <4 x i32> [[TMP32]], i1 false, i32 0, i32 0) +; SHADERTEST-NEXT: call void @lgc.output.export.generic.i32.i32.v4f32(i32 0, i32 0, <4 x float> [[TMP33]]) #[[ATTR6:[0-9]+]] +; SHADERTEST-NEXT: ret void +; ; ; ; SHADERTEST-LABEL: @color_export_shader( -; SHADERTEST-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i64 0 -; SHADERTEST-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i64 1 
-; SHADERTEST-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i64 2 -; SHADERTEST-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 3 -; SHADERTEST-NEXT: call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float [[TMP2]], float [[TMP3]], float [[TMP4]], float [[TMP5]], i1 immarg true, i1 immarg true) +; SHADERTEST-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0:%.*]], i64 0 +; SHADERTEST-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP0]], i64 1 +; SHADERTEST-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP0]], i64 2 +; SHADERTEST-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP0]], i64 3 +; SHADERTEST-NEXT: call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 15, float [[TMP3]], float [[TMP4]], float [[TMP5]], float [[TMP6]], i1 immarg true, i1 immarg true) #[[ATTR1]] ; SHADERTEST-NEXT: call void @llvm.amdgcn.endpgm() ; SHADERTEST-NEXT: unreachable ; -; SHADERTEST-LABEL: color_export_shader: -; SHADERTEST: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SHADERTEST-NEXT: exp mrt0 v0, v1, v2, v3 done vm -; SHADERTEST-NEXT: s_endpgm -; diff --git a/llpc/tool/amdllpc.cpp b/llpc/tool/amdllpc.cpp index 61211f861b..9515f034b0 100644 --- a/llpc/tool/amdllpc.cpp +++ b/llpc/tool/amdllpc.cpp @@ -680,8 +680,15 @@ static Error processInputs(ICompiler *compiler, InputSpecGroup &inputSpecs, bool const InputSpec &firstInput = inputSpecs.front(); if (isPipelineInfoFile(firstInput.filename)) { + bool unlinked = Unlinked; + + if (firstInput.filename.starts_with("PipelineLib") && !unlinked && !Unlinked.getNumOccurrences()) { + LLPC_WARN("Input filename starts with \"PipelineLib\". 
Assuming you meant -unlinked.\n"); + unlinked = true; + } + compileInfo.autoLayoutDesc = false; - if (Error err = processInputPipeline(compiler, compileInfo, firstInput, Unlinked, IgnoreColorAttachmentFormats)) + if (Error err = processInputPipeline(compiler, compileInfo, firstInput, unlinked, IgnoreColorAttachmentFormats)) return err; if (compileInfo.pipelineType == VfxPipelineTypeGraphicsLibrary) { @@ -792,6 +799,9 @@ static Error processInputs(ICompiler *compiler, InputSpecGroup &inputSpecs, bool dumpOptions->filterPipelineDumpByType = FilterPipelineDumpByType; dumpOptions->filterPipelineDumpByHash = FilterPipelineDumpByHash; dumpOptions->dumpDuplicatePipelines = DumpDuplicatePipelines; + + if (codegen::getFileType() != CodeGenFileType::ObjectFile) + return createResultError(Result::ErrorInvalidValue, "Pipeline dumps require the default (ELF) -filetype"); } std::unique_ptr builder = diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.cpp b/llpc/translator/lib/SPIRV/SPIRVReader.cpp index 9d55f8e7c5..1aeb804992 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.cpp +++ b/llpc/translator/lib/SPIRV/SPIRVReader.cpp @@ -1,4 +1,4 @@ -//===- SPIRVReader.cpp - Converts SPIR-V to LLVM ----------------*- C++ -*-===// +//===- SPIRVReader.cpp - Converts SPIR-V to LLVM ----------------*- C++ -*-===// // // The LLVM/SPIR-V Translator // @@ -277,7 +277,7 @@ SPIRVToLLVM::SPIRVToLLVM(Module *llvmModule, SPIRVModule *theSpirvModule, const const Vkgc::ShaderModuleUsage *moduleUsage, const Vkgc::PipelineShaderOptions *shaderOptions) : m_m(llvmModule), m_builder(builder), m_bm(theSpirvModule), m_entryTarget(nullptr), m_specConstMap(theSpecConstMap), m_convertingSamplers(convertingSamplers), m_dbgTran(m_bm, m_m, this), - m_moduleUsage(reinterpret_cast(moduleUsage)), m_debugOutputBuffer(nullptr), + m_moduleUsage(reinterpret_cast(moduleUsage)), m_shaderOptions(reinterpret_cast(shaderOptions)) { assert(m_m); m_context = &m_m->getContext(); @@ -1085,9 +1085,9 @@ Type 
*SPIRVToLLVM::transTypeImpl(SPIRVType *t, unsigned matrixStride, bool colum // image is not an array of three.) Type *imageTy = nullptr; if (st->getDescriptor().Dim == DimBuffer) { - imageTy = getBuilder()->getDescTy(ResourceNodeType::DescriptorTexelBuffer); + imageTy = PointerType::get(*m_context, SPIRAS_Constant); } else { - Type *singleImageTy = getBuilder()->getDescTy(ResourceNodeType::DescriptorResource); + Type *singleImageTy = PointerType::get(*m_context, SPIRAS_Constant); imageTy = ArrayType::get(singleImageTy, 3); if (st->getDescriptor().MS) { // A multisampled image is represented by a struct containing both the @@ -1105,7 +1105,7 @@ Type *SPIRVToLLVM::transTypeImpl(SPIRVType *t, unsigned matrixStride, bool colum // Get sampler type. // A sampler is represented by a struct containing the sampler itself, and the convertingSamplerIdx, an i32 // that is either 0 or the 1-based index into the converting samplers. - Type *ty = getBuilder()->getDescTy(ResourceNodeType::DescriptorSampler); + Type *ty = PointerType::get(*m_context, SPIRAS_Constant); ty = StructType::get(*m_context, {ty, getBuilder()->getInt32Ty()}); if (t->getOpCode() == OpTypeSampledImage) { // A sampledimage is represented by a struct containing the image descriptor @@ -1295,26 +1295,39 @@ Value *SPIRVToLLVM::transValue(SPIRVValue *bv, Function *f, BasicBlock *bb, bool Value *SPIRVToLLVM::transConvertInst(SPIRVValue *bv, Function *f, BasicBlock *bb) { SPIRVUnary *bc = static_cast(bv); + auto srcSpvType = bc->getOperand(0)->getType(); + auto dstSpvType = bc->getType(); auto src = transValue(bc->getOperand(0), f, bb, bb != nullptr); auto srcType = src->getType(); - auto dstType = transType(bc->getType()); + auto dstType = transType(dstSpvType); CastInst::CastOps co = Instruction::BitCast; + // Extension for OGLP: Only valid for bindless texture/image to convert uvec2 to gsampler/gimage + // uniform uvec2 textureHandle; + // vec4 result = texture(sampler2D(textureHandle), texCoord); + bool 
srcTypeUvec2 = srcSpvType->isTypeVectorInt(32) && (srcSpvType->getVectorComponentCount() == 2); + bool bindlessTexture = dstSpvType->isTypeSampledImage() && srcTypeUvec2; + bool bindlessImage = dstSpvType->isTypeImage() && srcTypeUvec2; + + if (bindlessTexture || bindlessImage) { + // 64 bit handle is stored in uvec2, we need to convert texHandle to uint64 at first + Value *imgDescGpuAddress = getBuilder()->CreateBitCast(src, getBuilder()->getInt64Ty()); + return transLoadBindlessImage(dstSpvType, imgDescGpuAddress, bindlessTexture); + } + lgc::CooperativeMatrixElementType srcElemTy = lgc::CooperativeMatrixElementType::Unknown; lgc::CooperativeMatrixElementType dstElemTy = lgc::CooperativeMatrixElementType::Unknown; lgc::CooperativeMatrixLayout srcLayout = lgc::CooperativeMatrixLayout::InvalidLayout; lgc::CooperativeMatrixLayout dstLayout = lgc::CooperativeMatrixLayout::InvalidLayout; if (bv->getType()->isTypeCooperativeMatrixKHR()) { - auto srcCompType = static_cast(bc->getOperand(0)->getType()) - ->getCooperativeMatrixKHRComponentType(); + auto srcCompType = static_cast(srcSpvType)->getCooperativeMatrixKHRComponentType(); srcElemTy = mapToBasicType(srcCompType); - auto dstCompType = - static_cast(bc->getType())->getCooperativeMatrixKHRComponentType(); + auto dstCompType = static_cast(dstSpvType)->getCooperativeMatrixKHRComponentType(); dstElemTy = mapToBasicType(dstCompType); - auto dstUse = static_cast(bc->getType())->getCooperativeMatrixKHRUse(); - unsigned rows = static_cast(bc->getType())->getCooperativeMatrixKHRRows(); - unsigned columns = static_cast(bc->getType())->getCooperativeMatrixKHRColumns(); + auto dstUse = static_cast(dstSpvType)->getCooperativeMatrixKHRUse(); + unsigned rows = static_cast(dstSpvType)->getCooperativeMatrixKHRRows(); + unsigned columns = static_cast(dstSpvType)->getCooperativeMatrixKHRColumns(); dstLayout = getCooperativeMatrixKHRLayout(static_cast(dstUse), dstElemTy, rows, columns); srcLayout = 
getCooperativeMatrixKHRLayout(static_cast(dstUse), srcElemTy, rows, columns); } @@ -2373,12 +2386,16 @@ static SyncScope::ID transScope(LLVMContext &context, const SPIRVConstant *const // Translate memory semantics from SPIR-V to LLVM. // // @param spvMemorySemantics : The semantics to translate. -// @param isAtomicRMW : Is the memory semantic from an atomic rmw operation. -static AtomicOrdering transMemorySemantics(const SPIRVConstant *const spvMemorySemantics, const bool isAtomicRMW) { +// @param readOnly : If the corresponding memory access only read. +// @param writeNone : If the corresponding memory access only write. +static AtomicOrdering transMemorySemantics(const SPIRVConstant *const spvMemorySemantics, const bool readOnly = false, + const bool writeOnly = false) { const unsigned semantics = static_cast(spvMemorySemantics->getZExtIntValue()); + // We are safe to downgrade the SequentiallyConsistent to Acquire/Release/AcquireRelease based on Vulkan validation + // rules within a module. if (semantics & MemorySemanticsSequentiallyConsistentMask) - return AtomicOrdering::SequentiallyConsistent; + return readOnly ? AtomicOrdering::Acquire : writeOnly ? 
AtomicOrdering::Release : AtomicOrdering::AcquireRelease; if (semantics & MemorySemanticsAcquireReleaseMask) return AtomicOrdering::AcquireRelease; if (semantics & MemorySemanticsAcquireMask) @@ -2400,8 +2417,7 @@ Value *SPIRVToLLVM::transAtomicRMW(SPIRVValue *const spvValue, const AtomicRMWIn SPIRVAtomicInstBase *const spvAtomicInst = static_cast(spvValue); const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))); - const AtomicOrdering ordering = - transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2)), true); + const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2))); Value *const atomicPointer = transValue(spvAtomicInst->getOpValue(0), getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock()); @@ -2439,8 +2455,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *c SPIRVAtomicLoad *const spvAtomicLoad = static_cast(spvValue); const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicLoad->getOpValue(1))); - const AtomicOrdering ordering = - transMemorySemantics(static_cast(spvAtomicLoad->getOpValue(2)), false); + const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicLoad->getOpValue(2)), + /*readOnly=*/true, /*writeOnly=*/false); Value *const loadPointer = transValue(spvAtomicLoad->getOpValue(0), getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock()); @@ -2467,8 +2483,8 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue * SPIRVAtomicStore *const spvAtomicStore = static_cast(spvValue); const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicStore->getOpValue(1))); - const AtomicOrdering ordering = - transMemorySemantics(static_cast(spvAtomicStore->getOpValue(2)), false); + const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicStore->getOpValue(2)), + /*readOnly=*/false, /*writeOnly=*/true); Value *const storePointer = 
transValue(spvAtomicStore->getOpValue(0), getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock()); @@ -2666,8 +2682,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVVa SPIRVAtomicInstBase *const spvAtomicInst = static_cast(spvValue); const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))); - const AtomicOrdering ordering = - transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2)), true); + const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2))); Value *const atomicPointer = transValue(spvAtomicInst->getOpValue(0), getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock()); @@ -2694,8 +2709,7 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVVa SPIRVAtomicInstBase *const spvAtomicInst = static_cast(spvValue); const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))); - const AtomicOrdering ordering = - transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2)), true); + const AtomicOrdering ordering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2))); Value *const atomicPointer = transValue(spvAtomicInst->getOpValue(0), getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock()); @@ -2724,9 +2738,9 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SP const SyncScope::ID scope = transScope(*m_context, static_cast(spvAtomicInst->getOpValue(1))); const AtomicOrdering successOrdering = - transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2)), true); - const AtomicOrdering failureOrdering = - transMemorySemantics(static_cast(spvAtomicInst->getOpValue(3)), true); + transMemorySemantics(static_cast(spvAtomicInst->getOpValue(2))); + AtomicOrdering failureOrdering = transMemorySemantics(static_cast(spvAtomicInst->getOpValue(3)), + /*readOnly=*/true, /*writeOnly=*/false); Value *const atomicPointer = transValue(spvAtomicInst->getOpValue(0), 
getBuilder()->GetInsertBlock()->getParent(), getBuilder()->GetInsertBlock()); @@ -2977,6 +2991,94 @@ Value *SPIRVToLLVM::transLoadImage(SPIRVValue *spvImageLoadPtr) { return loadImageSampler(elementTy, base); } +// ===================================================================================================================== +// Translate a load for UniformConstant that is image/sampledimage +// +// @param spvElementTy : The image/sampledimage pointer +// @param imgDescGpuAddress : image descriptor's gpu memory address +// @param bindlessTexture : true is bindless texture, false is bindless image +Value *SPIRVToLLVM::transLoadBindlessImage(SPIRVType *spvElementTy, Value *imgDescGpuAddress, bool bindlessTexture) { + + Type *elementTy = transType(spvElementTy, 0, false, false, LayoutMode::Native); + Type *gpuAddrAsPtrTy = getBuilder()->getPtrTy(SPIRAS_Constant); + auto imageDescAddr = getBuilder()->CreateIntToPtr(imgDescGpuAddress, gpuAddrAsPtrTy); + + SPIRVTypeImage *spvImageTy = nullptr; + if (spvElementTy->getOpCode() == OpTypeSampledImage) { + spvImageTy = static_cast(spvElementTy)->getImageType(); + } else { + spvImageTy = static_cast(spvElementTy); + } + + auto desc = spvImageTy->getDescriptor(); + Value *imageDescPtr = nullptr; + + // Handle samplerBuffer or imageBuffer + if (desc.Dim == DimBuffer) { + auto bufferDescStride = getBuilder()->getInt32(DescriptorSizeBuffer); + imageDescPtr = getBuilder()->CreateInsertValue( + PoisonValue::get(StructType::get(*m_context, {imageDescAddr->getType(), bufferDescStride->getType(), + bufferDescStride->getType(), getBuilder()->getInt32Ty()})), + imageDescAddr, 0); + imageDescPtr = getBuilder()->CreateInsertValue(imageDescPtr, bufferDescStride, 1); + } else { + // The descriptor stride is unimportant for bindless texture/image, just use it as a placeholder + auto imageDescStride = getBuilder()->getInt32(DescriptorSizeResource); + imageDescPtr = getBuilder()->CreateInsertValue( + 
PoisonValue::get(StructType::get(*m_context, {imageDescAddr->getType(), imageDescStride->getType(), + imageDescStride->getType(), getBuilder()->getInt32Ty()})), + imageDescAddr, 0); + + imageDescPtr = getBuilder()->CreateInsertValue(imageDescPtr, imageDescStride, 1); + imageDescPtr = getBuilder()->CreateInsertValue(imageDescPtr, getBuilder()->getInt32(DescriptorSizeResource), 2); + imageDescPtr = getBuilder()->CreateInsertValue(imageDescPtr, getBuilder()->getInt32(1), 3); + } + + // Insert fmask descriptor address into structure + if (desc.MS) { + auto fMaskOffset = getBuilder()->getInt64(DescriptorSizeResource + DescriptorSizeSampler); + constexpr unsigned descriptorSizeFmask = 8 * sizeof(uint32_t); + auto fmaskDescStride = getBuilder()->getInt32(descriptorSizeFmask); + Value *fMaskDescAddr = + getBuilder()->CreateIntToPtr(getBuilder()->CreateAdd(imgDescGpuAddress, fMaskOffset), gpuAddrAsPtrTy); + + auto fmaskDescPtr = getBuilder()->CreateInsertValue( + PoisonValue::get(StructType::get(*m_context, {fMaskDescAddr->getType(), fmaskDescStride->getType(), + fmaskDescStride->getType(), getBuilder()->getInt32Ty()})), + fMaskDescAddr, 0); + fmaskDescPtr = getBuilder()->CreateInsertValue(fmaskDescPtr, fmaskDescStride, 1); + imageDescPtr = getBuilder()->CreateInsertValue( + PoisonValue::get(StructType::get(*m_context, {imageDescPtr->getType(), fmaskDescPtr->getType()})), imageDescPtr, + 0); + imageDescPtr = getBuilder()->CreateInsertValue(imageDescPtr, fmaskDescPtr, 1); + } + + // True for bindless texture, otherwise is bindless image + if (bindlessTexture) { + auto samplerOffset = getBuilder()->getInt64(DescriptorSizeResource); + auto samplerDescStride = getBuilder()->getInt32(DescriptorSizeSampler); + + Value *samplerDescAddr = + getBuilder()->CreateIntToPtr(getBuilder()->CreateAdd(imgDescGpuAddress, samplerOffset), gpuAddrAsPtrTy); + + Type *samplerPtrTy = StructType::get( + *m_context, {samplerDescAddr->getType(), getBuilder()->getInt32Ty(), 
getBuilder()->getInt32Ty()}); + Value *samplerDescPtr = Constant::getNullValue(samplerPtrTy); + + samplerDescPtr = getBuilder()->CreateInsertValue(samplerDescPtr, samplerDescAddr, 0); + samplerDescPtr = getBuilder()->CreateInsertValue(samplerDescPtr, samplerDescStride, 1); + + Value *descPtr = + PoisonValue::get(StructType::get(*m_context, {imageDescPtr->getType(), samplerDescPtr->getType()})); + descPtr = getBuilder()->CreateInsertValue(descPtr, imageDescPtr, 0); + descPtr = getBuilder()->CreateInsertValue(descPtr, samplerDescPtr, 1); + + return loadImageSampler(elementTy, descPtr); + } + + return loadImageSampler(elementTy, imageDescPtr); +} + // ===================================================================================================================== // Generate a load of an image, sampler or sampledimage // @@ -3017,33 +3119,21 @@ Value *SPIRVToLLVM::loadImageSampler(Type *elementTy, Value *base) { // an array of three image descriptors, to allow for multiple planes in YCbCr conversion. Normally we only // load one descriptor; if there are any converting samplers, we load all three, and rely on later optimizations // to remove the unused ones (and thus stop us reading off the end of the descriptor table). - elementTy = arrayTy->getElementType(); - auto *oneVal = getBuilder()->CreateLoad(elementTy, ptr); - oneVal->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); - - Value *result = getBuilder()->CreateInsertValue(PoisonValue::get(arrayTy), oneVal, 0); + Value *result = getBuilder()->CreateInsertValue(PoisonValue::get(arrayTy), ptr, 0); // Pointer to image is represented as a struct containing {pointer, stride, planeStride, isResource}. 
if (!m_convertingSamplers.empty() && base->getType()->getStructNumElements() >= 4) { Value *planeStride = getBuilder()->CreateExtractValue(base, 2); Type *ptrTy = ptr->getType(); for (unsigned planeIdx = 1; planeIdx != arrayTy->getNumElements(); ++planeIdx) { - ptr = getBuilder()->CreateBitCast( - ptr, getBuilder()->getInt8Ty()->getPointerTo(ptr->getType()->getPointerAddressSpace())); ptr = getBuilder()->CreateGEP(getBuilder()->getInt8Ty(), ptr, planeStride); ptr = getBuilder()->CreateBitCast(ptr, ptrTy); - oneVal = getBuilder()->CreateLoad(elementTy, ptr); - oneVal->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); - result = getBuilder()->CreateInsertValue(result, oneVal, planeIdx); + result = getBuilder()->CreateInsertValue(result, ptr, planeIdx); } } return result; } - - // Other cases: Just load the element from the pointer. - auto load = getBuilder()->CreateLoad(elementTy, ptr); - load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(*m_context, {})); - return load; + return ptr; } // ===================================================================================================================== @@ -3070,7 +3160,8 @@ Value *SPIRVToLLVM::transImagePointer(SPIRVValue *spvImagePtr, SPIRVType *baseTy spvImagePtr->hasDecorate(DecorationBinding, 0, &binding); bool hasDescriptorSet = spvImagePtr->hasDecorate(DecorationDescriptorSet, 0, &descriptorSet); - assert(!getPipelineOptions()->replaceSetWithResourceType || !hasDescriptorSet || + + assert(!getPipelineOptions()->getGlState().replaceSetWithResourceType || !hasDescriptorSet || static_cast(spvImagePtr->getType())->getStorageClass() == StorageClassUniformConstant); (void)hasDescriptorSet; @@ -3092,7 +3183,7 @@ Value *SPIRVToLLVM::transImagePointer(SPIRVValue *spvImagePtr, SPIRVType *baseTy Value *imageDescPtr = nullptr; Value *samplerDescPtr = nullptr; - if (getPipelineOptions()->replaceSetWithResourceType) + if (getPipelineOptions()->getGlState().replaceSetWithResourceType) 
assert(spvTy->getOpCode() != OpTypeSampler); if (spvTy->getOpCode() != OpTypeSampler) { @@ -3106,11 +3197,11 @@ Value *SPIRVToLLVM::transImagePointer(SPIRVValue *spvImagePtr, SPIRVType *baseTy auto resType = desc->Dim == DimBuffer ? ResourceNodeType::DescriptorTexelBuffer : ResourceNodeType::DescriptorResource; - if (getPipelineOptions()->replaceSetWithResourceType) { + if (getPipelineOptions()->getGlState().replaceSetWithResourceType) { if (spvTy->getOpCode() == OpTypeImage) { descriptorSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorImage); } else if (spvTy->getOpCode() == OpTypeSampledImage) { - if (getPipelineOptions()->enableCombinedTexture) { + if (getPipelineOptions()->getGlState().enableCombinedTexture) { descriptorSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorCombinedTexture); } else { @@ -3123,7 +3214,7 @@ Value *SPIRVToLLVM::transImagePointer(SPIRVValue *spvImagePtr, SPIRVType *baseTy imageDescPtr = getDescPointerAndStride(resType, descriptorSet, binding, resType); if (desc->MS) { - if (getPipelineOptions()->replaceSetWithResourceType && spvTy->getOpCode() != OpTypeImage) + if (getPipelineOptions()->getGlState().replaceSetWithResourceType && spvTy->getOpCode() != OpTypeImage) descriptorSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorFmask); // A multisampled image pointer is a struct containing an image desc pointer and an fmask desc pointer. 
Value *fmaskDescPtr = getDescPointerAndStride(ResourceNodeType::DescriptorFmask, descriptorSet, binding, @@ -3136,7 +3227,8 @@ Value *SPIRVToLLVM::transImagePointer(SPIRVValue *spvImagePtr, SPIRVType *baseTy } if (spvTy->getOpCode() != OpTypeImage) { - if (getPipelineOptions()->replaceSetWithResourceType && !getPipelineOptions()->enableCombinedTexture) + if (getPipelineOptions()->getGlState().replaceSetWithResourceType && + !getPipelineOptions()->getGlState().enableCombinedTexture) descriptorSet = PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorSampler); // Sampler or sampledimage -- need to get the sampler {pointer,stride,convertingSamplerIdx} samplerDescPtr = getDescPointerAndStride(ResourceNodeType::DescriptorSampler, descriptorSet, binding, @@ -3211,7 +3303,7 @@ Value *SPIRVToLLVM::getDescPointerAndStride(ResourceNodeType resType, unsigned d unsigned convertingSamplerIdx = 0; unsigned nextIdx = 1; unsigned convertingSamplerDescriptorSet = descriptorSet; - if (getPipelineOptions()->replaceSetWithResourceType && + if (getPipelineOptions()->getGlState().replaceSetWithResourceType && descriptorSet == PipelineContext::getGlResourceNodeSetFromType(Vkgc::ResourceMappingNodeType::DescriptorSampler)) { // When using 'replaceSetWithResourceType' option (OGL default) it's not possible to match converting samplers @@ -3743,8 +3835,6 @@ Value *SPIRVToLLVM::indexDescPtr(Type *elementTy, Value *base, Value *index) { // Do the indexing operation by GEPping as a byte pointer. 
Type *ptrTy = ptr->getType(); - ptr = getBuilder()->CreateBitCast(ptr, - getBuilder()->getInt8Ty()->getPointerTo(ptr->getType()->getPointerAddressSpace())); ptr = getBuilder()->CreateGEP(getBuilder()->getInt8Ty(), ptr, index); ptr = getBuilder()->CreateBitCast(ptr, ptrTy); base = getBuilder()->CreateInsertValue(base, ptr, 0); @@ -4859,39 +4949,14 @@ template <> Value *SPIRVToLLVM::transValueWithOpcode(SPIRVValue *cons // @param bb : Which basicblock to generate code Value *SPIRVToLLVM::transDebugPrintf(SPIRVInstruction *bi, const ArrayRef spvValues, Function *func, BasicBlock *bb) { - auto resMapping = getPipelineContext()->getResourceMapping(); - unsigned nodeIndex = 0; - if (findResourceNode(resMapping->pUserDataNodes, resMapping->userDataNodeCount, Vkgc::InternalDescriptorSetId, - Vkgc::PrintfBufferBindingId, &nodeIndex) == nullptr) - return getBuilder()->getInt64(0); - - if (!m_debugOutputBuffer) { - auto spvArrType = m_bm->addRuntimeArray(m_bm->addIntegerType(32)); - auto spvStructType = m_bm->addStructType({spvArrType}); - Type *bufType = transType(spvStructType); - - m_debugOutputBuffer = - new GlobalVariable(*m_m, bufType, false, GlobalValue::ExternalLinkage, nullptr, "debugOutputBuffer", nullptr, - GlobalVariable::NotThreadLocal, SPIRAS_Uniform); - - // Setup (desc,binding) resource metadata - auto intType = getBuilder()->getInt32Ty(); - SmallVector resourceMetas = { - ConstantAsMetadata::get(ConstantInt::get(intType, Vkgc::InternalDescriptorSetId)), - ConstantAsMetadata::get(ConstantInt::get(intType, Vkgc::PrintfBufferBindingId)), - ConstantAsMetadata::get(ConstantInt::get(intType, 0))}; - - auto resMdNode = MDNode::get(*m_context, resourceMetas); - m_debugOutputBuffer->addMetadata(gSPIRVMD::Resource, *resMdNode); - } - auto spvValueItr = spvValues.begin(); - Value *formatStr = mapEntry(*spvValueItr++, nullptr); + const SPIRVEntry *spvStrEntry = *spvValueItr++; + auto spvStr = static_cast(spvStrEntry); SmallVector args; for (; spvValueItr != 
spvValues.end(); ++spvValueItr) { args.push_back(transValue(*spvValueItr, func, bb)); } - return getBuilder()->create(m_debugOutputBuffer, formatStr, args); + return getBuilder()->create(spvStr->getStr(), args); } // Translate an initializer. This has special handling for the case where the type to initialize to does not match the @@ -7153,7 +7218,7 @@ static unsigned convertDimension(const SPIRVTypeImageDescriptor *desc) { case Dim1D: return lgc::Builder::Dim1D; case DimBuffer: - return lgc::Builder::Dim1D; + return lgc::Builder::Dim1DBuffer; case Dim2D: return lgc::Builder::Dim2D; case DimRect: @@ -7172,7 +7237,7 @@ static unsigned convertDimension(const SPIRVTypeImageDescriptor *desc) { case Dim1D: return lgc::Builder::Dim1DArray; case DimBuffer: - return lgc::Builder::Dim1DArray; + return lgc::Builder::Dim1DArrayBuffer; case Dim2D: return lgc::Builder::Dim2DArray; case DimCube: @@ -7720,8 +7785,11 @@ Value *SPIRVToLLVM::transSPIRVImageAtomicOpFromInst(SPIRVInstruction *bi, BasicB // Determine the atomic ordering. AtomicOrdering ordering = AtomicOrdering::NotAtomic; if (scope != ScopeInvocation) { + // We are safe to downgrade the SequentiallyConsistent to Acquire/AcquireRelease based on Vulkan validation rules + // within a module. + bool readOnly = bi->getOpCode() == OpAtomicLoad; if (semantics & MemorySemanticsSequentiallyConsistentMask) - ordering = AtomicOrdering::SequentiallyConsistent; + ordering = readOnly ? 
AtomicOrdering::Acquire : AtomicOrdering::AcquireRelease; else if (semantics & MemorySemanticsAcquireReleaseMask) ordering = AtomicOrdering::AcquireRelease; else if (semantics & MemorySemanticsAcquireMask) @@ -8994,7 +9062,7 @@ bool SPIRVToLLVM::transDecoration(SPIRVValue *bv, ArrayRef values) { Type *mdTy = nullptr; SPIRVType *bt = bv->getType()->getPointerElementType(); bool vs64BitsAttribInputSingleLoc = (as == SPIRAS_Input && m_execModule == ExecutionModelVertex && - getPipelineOptions()->vertex64BitsAttribSingleLoc); + getPipelineOptions()->getGlState().vertex64BitsAttribSingleLoc); auto md = buildShaderInOutMetadata(bt, inOutDec, mdTy, vs64BitsAttribInputSingleLoc); // Setup input/output metadata @@ -9036,7 +9104,7 @@ bool SPIRVToLLVM::transDecoration(SPIRVValue *bv, ArrayRef values) { assert(blockTy->isTypeStruct() || blockTy->isTypeAccelerationStructureKHR() || bv->getType()->getPointerStorageClass() == StorageClassAtomicCounter); - if (getPipelineOptions()->replaceSetWithResourceType) { + if (getPipelineOptions()->getGlState().replaceSetWithResourceType) { bool hasBlock = blockTy->hasDecorate(DecorationBlock); bool hasBufferBlock = blockTy->hasDecorate(DecorationBufferBlock); @@ -9340,6 +9408,14 @@ Constant *SPIRVToLLVM::buildShaderInOutMetadata(SPIRVType *bt, ShaderInOutDecora inOutMd.Component = inOutDec.Component; inOutMd.InterpMode = inOutDec.Interp.Mode; + auto llpcContext = static_cast(m_context); + auto info = static_cast(llpcContext->getPipelineBuildInfo()); + if ((llpcContext->getPipelineType() == PipelineType::Graphics) && info->glState.enableFlatShade && + (inOutMd.Value == Vkgc::GlCompatibilityInOutLocation::FrontColor || + inOutMd.Value == Vkgc::GlCompatibilityInOutLocation::BackColor || + inOutMd.Value == Vkgc::GlCompatibilityInOutLocation::FrontSecondaryColor || + inOutMd.Value == Vkgc::GlCompatibilityInOutLocation::BackSecondaryColor)) + inOutMd.InterpMode = InterpModeFlat; inOutMd.InterpLoc = inOutDec.Interp.Loc; inOutMd.PerPatch = 
inOutDec.PerPatch; inOutMd.PerPrimitive = inOutDec.PerPrimitive; @@ -10418,8 +10494,10 @@ Instruction *SPIRVToLLVM::transBarrier(BasicBlock *bb, SPIRVWord execScope, SPIR Instruction *SPIRVToLLVM::transMemFence(BasicBlock *bb, SPIRVWord memSema, SPIRVWord memScope) { AtomicOrdering ordering = AtomicOrdering::NotAtomic; + // We are safe to downgrade the SequentiallyConsistent to AcquireRelease based on Vulkan validation rules within a + // module. if (memSema & MemorySemanticsSequentiallyConsistentMask) - ordering = AtomicOrdering::SequentiallyConsistent; + ordering = AtomicOrdering::AcquireRelease; else if (memSema & MemorySemanticsAcquireReleaseMask) ordering = AtomicOrdering::AcquireRelease; else if (memSema & MemorySemanticsAcquireMask) @@ -10437,10 +10515,6 @@ Instruction *SPIRVToLLVM::transMemFence(BasicBlock *bb, SPIRVWord memSema, SPIRV if (ordering == AtomicOrdering::NotAtomic) return nullptr; - // Upgrade the ordering if we need to make it available or visible - if (memSema & (MemorySemanticsMakeAvailableKHRMask | MemorySemanticsMakeVisibleKHRMask)) - ordering = AtomicOrdering::SequentiallyConsistent; - SyncScope::ID scope = SyncScope::System; switch (memScope) { diff --git a/llpc/translator/lib/SPIRV/SPIRVReader.h b/llpc/translator/lib/SPIRV/SPIRVReader.h index 04e5929cd6..aadf063459 100644 --- a/llpc/translator/lib/SPIRV/SPIRVReader.h +++ b/llpc/translator/lib/SPIRV/SPIRVReader.h @@ -110,6 +110,7 @@ class SPIRVToLLVM { template SmallVector transValueMultiWithOpcode(SPIRVValue *); template SmallVector transValueMultiWithOpcode(SPIRVValue *, Function *f, BasicBlock *bb); Value *transLoadImage(SPIRVValue *spvImageLoadPtr); + Value *transLoadBindlessImage(SPIRVType *spvElementTy, Value *imgDescGpuAddress, bool bindlessTexture); Value *loadImageSampler(Type *elementTy, Value *base); Value *transImagePointer(SPIRVValue *spvImagePtr, SPIRVType *elementTy = nullptr); Value *getDescPointerAndStride(lgc::ResourceNodeType resType, unsigned descriptorSet, unsigned 
binding, @@ -287,7 +288,6 @@ class SPIRVToLLVM { DenseMap, Type *> m_overlappingStructTypeWorkaroundMap; DenseMap m_blockPredecessorToCount; const Vkgc::ShaderModuleUsage *m_moduleUsage; - GlobalVariable *m_debugOutputBuffer; const Vkgc::PipelineShaderOptions *m_shaderOptions; bool m_workaroundStorageImageFormats; diff --git a/llpc/util/llpcDebug.h b/llpc/util/llpcDebug.h index f406c1794d..804574e25a 100644 --- a/llpc/util/llpcDebug.h +++ b/llpc/util/llpcDebug.h @@ -41,6 +41,15 @@ } \ while (false) +// Output error message +#define LLPC_WARN(_msg) \ + do \ + if (Llpc::EnableErrs()) { \ + llvm::outs() << "WARNING: " << _msg; \ + llvm::outs().flush(); \ + } \ + while (false) + // Output general message #define LLPC_OUTS(_msg) \ do \ diff --git a/llpc/util/llpcShaderModuleHelper.cpp b/llpc/util/llpcShaderModuleHelper.cpp index 9f9959f940..4ac381ba2a 100644 --- a/llpc/util/llpcShaderModuleHelper.cpp +++ b/llpc/util/llpcShaderModuleHelper.cpp @@ -151,20 +151,94 @@ ShaderModuleUsage ShaderModuleHelper::getShaderModuleUsageInfo(const BinaryData break; } case BuiltInPointCoord: - case BuiltInPrimitiveId: case BuiltInLayer: case BuiltInClipDistance: case BuiltInCullDistance: { shaderModuleUsage.useGenericBuiltIn = true; break; } + case BuiltInBaryCoordKHR: + case BuiltInBaryCoordNoPerspKHR: { + shaderModuleUsage.useBarycentric = true; + break; + } + case BuiltInPrimitiveId: { + shaderModuleUsage.useGenericBuiltIn = true; + shaderModuleUsage.rtSystemValueUsage.primitive.primitiveIndex = 1; + break; + } + case BuiltInInstanceId: { + shaderModuleUsage.rtSystemValueUsage.primitive.instanceID = 1; + break; + } + case BuiltInLaunchIdKHR: { + shaderModuleUsage.rtSystemValueUsage.ray.launchId = 1; + break; + } + case BuiltInLaunchSizeKHR: { + shaderModuleUsage.rtSystemValueUsage.ray.launchSize = 1; + break; + } + case BuiltInWorldRayOriginKHR: { + shaderModuleUsage.rtSystemValueUsage.ray.worldRayOrigin = 1; + break; + } + case BuiltInWorldRayDirectionKHR: { + 
shaderModuleUsage.rtSystemValueUsage.ray.worldRayDirection = 1; + break; + } + case BuiltInObjectRayOriginKHR: { + shaderModuleUsage.rtSystemValueUsage.primitive.objectRayOrigin = 1; + break; + } + case BuiltInObjectRayDirectionKHR: { + shaderModuleUsage.rtSystemValueUsage.primitive.objectRayDirection = 1; + break; + } + case BuiltInRayTminKHR: { + shaderModuleUsage.rtSystemValueUsage.ray.tMin = 1; + break; + } + case BuiltInInstanceCustomIndexKHR: { + shaderModuleUsage.rtSystemValueUsage.primitive.instanceIndex = 1; + break; + } + case BuiltInObjectToWorldKHR: { + shaderModuleUsage.rtSystemValueUsage.primitive.objectToWorld = 1; + break; + } + case BuiltInWorldToObjectKHR: { + shaderModuleUsage.rtSystemValueUsage.primitive.worldToObject = 1; + break; + } + case BuiltInHitTNV: { + shaderModuleUsage.rtSystemValueUsage.ray.tCurrent = 1; + break; + } + case BuiltInHitKindKHR: { + shaderModuleUsage.rtSystemValueUsage.primitive.hitKind = 1; + break; + } + case BuiltInHitTriangleVertexPositionsKHR: { + shaderModuleUsage.rtSystemValueUsage.primitive.hitTrianglePosition = 1; + break; + } + case BuiltInIncomingRayFlagsKHR: { + shaderModuleUsage.rtSystemValueUsage.ray.flags = 1; + break; + } + case BuiltInRayGeometryIndexKHR: { + shaderModuleUsage.rtSystemValueUsage.primitive.geometryIndex = 1; + break; + } default: { break; } } } else if (decoration == DecorationIndex) { hasIndexDecoration = true; - } + } else if (decoration == DecorationPerVertexKHR) + shaderModuleUsage.useBarycentric = true; break; } case OpSpecConstantTrue: diff --git a/llvmraytracing/include/llvmraytracing/Continuations.h b/llvmraytracing/include/llvmraytracing/Continuations.h index a53a01df61..624ec64ced 100644 --- a/llvmraytracing/include/llvmraytracing/Continuations.h +++ b/llvmraytracing/include/llvmraytracing/Continuations.h @@ -72,6 +72,7 @@ #pragma once +#include "TypesMetadata.h" #include "compilerutils/CompilerUtils.h" #include "llvm-dialects/Dialect/Builder.h" #include 
"llvmraytracing/ContinuationsUtil.h" @@ -473,8 +474,4 @@ Function *lowerStructRetArgument(Function *Fn); /// Add necessary continuation transform passes for LGC. void addLgcContinuationTransform(ModulePassManager &MPM); -/// LLVM parser callback which adds !types metadata during DXIL parsing -void DXILValueTypeMetadataCallback(Value *V, unsigned TypeID, - GetTypeByIDTy GetTypeByID, - GetContainedTypeIDTy GetContainedTypeID); } // namespace llvm diff --git a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h index 8fe18562f6..ac1fb42de5 100644 --- a/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h +++ b/llvmraytracing/include/llvmraytracing/ContinuationsUtil.h @@ -173,16 +173,6 @@ class ContFuncTy { void writeMetadata(Function *F); }; -/// Return element type of a function argument resolving opaque pointers -/// via !types metadata where appropriate. -/// Returns nullptr for non-pointers. -Type *getFuncArgPtrElementType(const Argument *Arg); - -/// Return element type of a function argument resolving opaque pointers -/// via !types metadata where appropriate. -/// Returns nullptr for non-pointers. -Type *getFuncArgPtrElementType(const Function *F, int ArgNo); - struct ContSetting { /// A hash value that is used as name. uint64_t NameHash; @@ -272,10 +262,6 @@ class ContHelper { // Marks an await as a waiting one with a wait mask. static constexpr const char *MDIsWaitAwaitName = "continuation.wait.await"; - // Whether this is a load instruction that should translate to a last_use - // load. - static constexpr const char *MDIsLastUseName = "amdgpu.last.use"; - static std::optional extractZExtI32Constant(MDNode *Node) { if (Node) { uint64_t Result = @@ -595,13 +581,6 @@ class ContHelper { CI.setMetadata(ContHelper::MDIsWaitAwaitName, nullptr); } - // Specifies that this is a load that marks a last use of the pointer it loads - // from. 
- static void setIsLastUseLoad(LoadInst &Load) { - Load.setMetadata(ContHelper::MDIsLastUseName, - MDTuple::get(Load.getContext(), {})); - } - /// Returns true if a call to the given function should be rematerialized /// in a shader of the specified kind. /// diff --git a/llvmraytracing/include/llvmraytracing/TypesMetadata.h b/llvmraytracing/include/llvmraytracing/TypesMetadata.h new file mode 100644 index 0000000000..e1db8e80d7 --- /dev/null +++ b/llvmraytracing/include/llvmraytracing/TypesMetadata.h @@ -0,0 +1,49 @@ +/* + *********************************************************************************************************************** + * + * Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + *all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + **********************************************************************************************************************/ + +//===- TypesMetadata.h - Pointee type metadata for processing DXIL ---------==// + +#pragma once + +#include "llvm/Bitcode/BitcodeReader.h" + +namespace llvm { + +/// Return element type of a function argument resolving opaque pointers +/// via !types metadata where appropriate. +/// Returns nullptr for non-pointers. +Type *getFuncArgPtrElementType(const Argument *Arg); + +/// Return element type of a function argument resolving opaque pointers +/// via !types metadata where appropriate. +/// Returns nullptr for non-pointers. +Type *getFuncArgPtrElementType(const Function *F, int ArgNo); + +/// LLVM parser callback which adds !types metadata during DXIL parsing +void DXILValueTypeMetadataCallback(Value *V, unsigned TypeID, + GetTypeByIDTy GetTypeByID, + GetContainedTypeIDTy GetContainedTypeID); + +} // namespace llvm diff --git a/llvmraytracing/lib/Continuations.cpp b/llvmraytracing/lib/Continuations.cpp index 5ecf797ef2..d314c9eb91 100644 --- a/llvmraytracing/lib/Continuations.cpp +++ b/llvmraytracing/lib/Continuations.cpp @@ -1265,6 +1265,8 @@ bool llvm::LgcMaterializable(Instruction &OrigI) { // FIXME: switch to dialectOp check. 
if (CalledName.starts_with("lgc.user.data") || CalledName.starts_with("lgc.shader.input") || + CalledName.starts_with("lgc.create.get.desc.ptr") || + CalledName.starts_with("lgc.load.buffer.desc") || CalledName.starts_with("lgc.load.user.data")) return true; } diff --git a/llvmraytracing/lib/DXILContPostProcess.cpp b/llvmraytracing/lib/DXILContPostProcess.cpp index 56c3c4f05b..ae434696fc 100644 --- a/llvmraytracing/lib/DXILContPostProcess.cpp +++ b/llvmraytracing/lib/DXILContPostProcess.cpp @@ -569,7 +569,7 @@ void DXILContPostProcessPassImpl::handleContStackIntrinsic( Align(CpsStackLowering::getContinuationStackAlignment())); if (FuncName.starts_with("LoadLastUse")) - ContHelper::setIsLastUseLoad(*cast(Replacement)); + CompilerUtils::setIsLastUseLoad(*cast(Replacement)); IsMemoryAccess = true; } else if (FuncName.starts_with("Store")) { diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll index 5f4629bae2..c8abbcea25 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-get-i32.ll @@ -31,7 +31,7 @@ define void @main() { ; ALL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; ALL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; ALL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; ALL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr addrspace(20) @REGISTERS, i32 0, i32 5), align 4 +; ALL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(20) getelementptr {{(inbounds )?}}([30 x i32], ptr addrspace(20) @REGISTERS, i32 0, i32 5), align 4 ; ALL-NEXT: store i32 [[TMP2]], ptr @debug_global, align 4 ; ALL-NEXT: ret void ; ALL: entry.split: @@ -43,7 +43,7 @@ define void @main() { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; 
LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr addrspace(20) @PAYLOAD, i32 0, i32 5), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(20) getelementptr {{(inbounds )?}}([30 x i32], ptr addrspace(20) @PAYLOAD, i32 0, i32 5), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: store i32 [[VAL]], ptr @debug_global, align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; diff --git a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll index 14a1f07454..8eb99d7bc7 100644 --- a/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll +++ b/llvmraytracing/test/dx/intrinsics/cont-payload-registers-set-i32.ll @@ -29,7 +29,7 @@ define void @main() { ; ALL-NEXT: store i32 [[CSPINIT]], ptr [[CSP]], align 4 ; ALL-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], 0 ; ALL-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; ALL-NEXT: store i32 42, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr addrspace(20) @REGISTERS, i32 0, i32 5), align 4 +; ALL-NEXT: store i32 42, ptr addrspace(20) getelementptr {{(inbounds )?}}([30 x i32], ptr addrspace(20) @REGISTERS, i32 0, i32 5), align 4 ; ALL-NEXT: ret void ; ALL: entry.split: ; ALL-NEXT: unreachable @@ -40,7 +40,7 @@ define void @main() { ; LOWERRAYTRACINGPIPELINE-NEXT: [[SYSTEM_DATA_ALLOCA:%.*]] = alloca [[STRUCT_DISPATCHSYSTEMDATA]], align 8 ; LOWERRAYTRACINGPIPELINE-NEXT: store [[STRUCT_DISPATCHSYSTEMDATA]] [[TMP0]], ptr [[SYSTEM_DATA_ALLOCA]], align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: call void @amd.dx.setLocalRootIndex(i32 0) -; LOWERRAYTRACINGPIPELINE-NEXT: store i32 42, ptr addrspace(20) getelementptr inbounds ([30 x i32], ptr 
addrspace(20) @PAYLOAD, i32 0, i32 5), align 4 +; LOWERRAYTRACINGPIPELINE-NEXT: store i32 42, ptr addrspace(20) getelementptr {{(inbounds )?}}([30 x i32], ptr addrspace(20) @PAYLOAD, i32 0, i32 5), align 4 ; LOWERRAYTRACINGPIPELINE-NEXT: ret void ; entry: diff --git a/tool/dumper/vkgcPipelineDumper.cpp b/tool/dumper/vkgcPipelineDumper.cpp index 7ca0eb0c2f..534fce5bc2 100644 --- a/tool/dumper/vkgcPipelineDumper.cpp +++ b/tool/dumper/vkgcPipelineDumper.cpp @@ -936,14 +936,15 @@ void PipelineDumper::dumpPipelineOptions(const PipelineOptions *options, std::os dumpFile << "options.internalRtShaders = " << options->internalRtShaders << "\n"; dumpFile << "options.forceNonUniformResourceIndexStageMask = " << options->forceNonUniformResourceIndexStageMask << "\n"; - dumpFile << "options.replaceSetWithResourceType = " << options->replaceSetWithResourceType << "\n"; - dumpFile << "options.disableSampleMask = " << options->disableSampleMask << "\n"; - dumpFile << "options.buildResourcesDataForShaderModule = " << options->buildResourcesDataForShaderModule << "\n"; - dumpFile << "options.disableTruncCoordForGather = " << options->disableTruncCoordForGather << "\n"; - dumpFile << "options.enableCombinedTexture = " << options->enableCombinedTexture << "\n"; - dumpFile << "options.vertex64BitsAttribSingleLoc = " << options->vertex64BitsAttribSingleLoc << "\n"; - dumpFile << "options.enableFragColor = " << options->enableFragColor << "\n"; - dumpFile << "options.disableBaseVertex = " << options->disableBaseVertex << "\n"; + dumpFile << "options.replaceSetWithResourceType = " << options->getGlState().replaceSetWithResourceType << "\n"; + dumpFile << "options.disableSampleMask = " << options->getGlState().disableSampleMask << "\n"; + dumpFile << "options.buildResourcesDataForShaderModule = " << options->getGlState().buildResourcesDataForShaderModule + << "\n"; + dumpFile << "options.disableTruncCoordForGather = " << options->getGlState().disableTruncCoordForGather << "\n"; + dumpFile 
<< "options.enableCombinedTexture = " << options->getGlState().enableCombinedTexture << "\n"; + dumpFile << "options.vertex64BitsAttribSingleLoc = " << options->getGlState().vertex64BitsAttribSingleLoc << "\n"; + dumpFile << "options.enableFragColor = " << options->getGlState().enableFragColor << "\n"; + dumpFile << "options.disableBaseVertex = " << options->getGlState().disableBaseVertex << "\n"; dumpFile << "options.enablePrimGeneratedQuery = " << options->enablePrimGeneratedQuery << "\n"; dumpFile << "options.disablePerCompFetch = " << options->disablePerCompFetch << "\n"; } @@ -1043,6 +1044,7 @@ void PipelineDumper::dumpGraphicsStateInfo(const GraphicsPipelineBuildInfo *pipe dumpFile << "dynamicTopology = " << pipelineInfo->dynamicTopology << "\n"; dumpFile << "enableColorClampVs = " << pipelineInfo->glState.enableColorClampVs << "\n"; dumpFile << "enableColorClampFs = " << pipelineInfo->glState.enableColorClampFs << "\n"; + dumpFile << "enableFlatShade = " << pipelineInfo->glState.enableFlatShade << "\n"; dumpFile << "originUpperLeft = " << pipelineInfo->getGlState().originUpperLeft << "\n"; if (pipelineInfo->clientMetadataSize > 0) { @@ -1570,6 +1572,7 @@ MetroHash::Hash PipelineDumper::generateHashForGraphicsPipeline(const GraphicsPi hasher.Update(pipeline->glState.enableColorClampVs); hasher.Update(pipeline->glState.enableColorClampFs); + hasher.Update(pipeline->glState.enableFlatShade); MetroHash::Hash hash = {}; hasher.Finalize(hash.bytes); @@ -1873,7 +1876,7 @@ void PipelineDumper::updateHashForPipelineOptions(const PipelineOptions *options } if (stage == UnlinkedStageFragment || stage == UnlinkedStageCount) { hasher->Update(options->enableInterpModePatch); - hasher->Update(options->disableSampleMask); + hasher->Update(options->getGlState().disableSampleMask); } hasher->Update(options->pageMigrationEnabled); hasher->Update(options->optimizationLevel); @@ -1886,13 +1889,13 @@ void PipelineDumper::updateHashForPipelineOptions(const PipelineOptions *options 
hasher->Update(options->reverseThreadGroup); hasher->Update(options->internalRtShaders); hasher->Update(options->forceNonUniformResourceIndexStageMask); - hasher->Update(options->replaceSetWithResourceType); - hasher->Update(options->buildResourcesDataForShaderModule); - hasher->Update(options->disableTruncCoordForGather); - hasher->Update(options->enableCombinedTexture); - hasher->Update(options->vertex64BitsAttribSingleLoc); - hasher->Update(options->enableFragColor); - hasher->Update(options->disableBaseVertex); + hasher->Update(options->getGlState().replaceSetWithResourceType); + hasher->Update(options->getGlState().buildResourcesDataForShaderModule); + hasher->Update(options->getGlState().disableTruncCoordForGather); + hasher->Update(options->getGlState().enableCombinedTexture); + hasher->Update(options->getGlState().vertex64BitsAttribSingleLoc); + hasher->Update(options->getGlState().enableFragColor); + hasher->Update(options->getGlState().disableBaseVertex); hasher->Update(options->enablePrimGeneratedQuery); // disablePerCompFetch has been handled in updateHashForNonFragmentState } @@ -2191,12 +2194,18 @@ template // @param reader : ELF object OStream &operator<<(OStream &out, ElfReader &reader) { unsigned sectionCount = reader.getSectionCount(); + bool sortSection = reader.getMap().size() == sectionCount; char formatBuf[256]; - for (unsigned sortIdx = 0; sortIdx < sectionCount; ++sortIdx) { + for (unsigned idx = 0; idx < sectionCount; ++idx) { typename ElfReader::SectionBuffer *section = nullptr; - unsigned secIdx = 0; - Result result = reader.getSectionDataBySortingIndex(sortIdx, &secIdx, §ion); + Result result = Result::Success; + unsigned secIdx = idx; + if (sortSection) { + result = reader.getSectionDataBySortingIndex(idx, &secIdx, §ion); + } else { + result = reader.getSectionDataBySectionIndex(idx, §ion); + } assert(result == Result::Success); (void(result)); // unused if (strcmp(section->name, ShStrTabName) == 0 || strcmp(section->name, StrTabName) 
== 0 || @@ -2418,7 +2427,7 @@ OStream &operator<<(OStream &out, ElfReader &reader) { while (startPos < section->secHead.sh_size) { if (symIdx < symbols.size()) - endPos = static_cast(symbols[symIdx].value); + endPos = static_cast(std::min(symbols[symIdx].value, section->secHead.sh_size)); else endPos = static_cast(section->secHead.sh_size); @@ -2428,12 +2437,16 @@ OStream &operator<<(OStream &out, ElfReader &reader) { out << " " << symbols[symIdx].pSymName << " (offset = " << symbols[symIdx].value << " size = " << symbols[symIdx].size; - MetroHash::Hash hash = {}; - MetroHash64::Hash( - reinterpret_cast(voidPtrInc(section->data, static_cast(symbols[symIdx].value))), - symbols[symIdx].size, hash.bytes); - uint64_t hashCode64 = MetroHash::compact64(&hash); - snprintf(formatBuf, sizeof(formatBuf), " hash = 0x%016" PRIX64 ")\n", hashCode64); + if ((symbols[symIdx].value + symbols[symIdx].size) <= section->secHead.sh_size) { + MetroHash::Hash hash = {}; + MetroHash64::Hash(reinterpret_cast( + voidPtrInc(section->data, static_cast(symbols[symIdx].value))), + symbols[symIdx].size, hash.bytes); + uint64_t hashCode64 = MetroHash::compact64(&hash); + snprintf(formatBuf, sizeof(formatBuf), " hash = 0x%016" PRIX64 ")\n", hashCode64); + } else { + snprintf(formatBuf, sizeof(formatBuf), " hash = Unknown )\n"); + } out << formatBuf; } ++symIdx; diff --git a/tool/vfx/vfxVkSection.h b/tool/vfx/vfxVkSection.h index 313c79789b..ce4cc75742 100644 --- a/tool/vfx/vfxVkSection.h +++ b/tool/vfx/vfxVkSection.h @@ -882,6 +882,7 @@ class SectionGraphicsState : public Section { INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, cbState, dualSourceBlendDynamic, MemberTypeBool, false); INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableColorClampVs, MemberTypeBool, false); INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableColorClampFs, MemberTypeBool, false); + INIT_STATE_SUB_MEMBER_NAME_TO_ADDR(SectionGraphicsState, glState, enableFlatShade, 
MemberTypeBool, false); INIT_MEMBER_ARRAY_NAME_TO_ADDR(SectionGraphicsState, m_colorBuffer, MemberTypeColorBufferItem, Vkgc::MaxColorTargets, true); @@ -896,6 +897,7 @@ class SectionGraphicsState : public Section { INIT_STATE_MEMBER_NAME_TO_ADDR(SectionGraphicsState, useSoftwareVertexBufferDescriptors, MemberTypeBool, false); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_shaderLibrary, MemberTypeString, false); INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_rtState, MemberTypeRtState, true); + INIT_MEMBER_NAME_TO_ADDR(SectionGraphicsState, m_clientMetadata, MemberTypeU8Array, false); INIT_MEMBER_ARRAY_NAME_TO_ADDR(SectionGraphicsState, m_uniformConstantMaps, MemberTypeUniformConstantMap, Vkgc::ShaderStageGfxCount, true); diff --git a/util/vkgcUtil.cpp b/util/vkgcUtil.cpp index da52ff8df4..8f948b8b13 100644 --- a/util/vkgcUtil.cpp +++ b/util/vkgcUtil.cpp @@ -52,6 +52,14 @@ const char *VKAPI_CALL IUtil::GetEntryPointNameFromSpirvBinary(const BinaryData return getEntryPointNameFromSpirvBinary(spvBin); } +// ===================================================================================================================== +// Translate enum "ResourceMappingNodeType" to string +// +// @param type : Resource map node type +const char *VKAPI_CALL IUtil::GetResourceMappingNodeTypeName(ResourceMappingNodeType type) { + return getResourceMappingNodeTypeName(type); +} + // ===================================================================================================================== // Gets name string of the abbreviation for the specified shader stage // diff --git a/version/include/llpcVersion.h.in b/version/include/llpcVersion.h.in index d947653255..e8d33c7a96 100644 --- a/version/include/llpcVersion.h.in +++ b/version/include/llpcVersion.h.in @@ -37,6 +37,8 @@ // %Version History // | %Version | Change Description | // | -------- | ----------------------------------------------------------------------------------------------------- | +// | 73.0 | Add all the 
ogl specific pipeline options in a new structure GLState | +// | 72.4 | Add enableFlatShade to GraphicsPipelineBuildInfo. | // | 72.3 | Add enableColorClampVs and enableColorClampFs to GraphicsPipelineBuildInfo. | // | 72.2 | Add pGpurtOptions and gpurtOptionCount to RayTracingPipelineBuildInfo | // | 72.1 | Add dynamicTopology to GraphicsPipelineBuildInfo |