dotnet · fanyang-mono · Feb 12, 2024 · Feb 7, 2024 · Feb 7, 2024 · Feb 7, 2024
diff --git a/.../System.Private.CoreLib/src/System/Runtime/Intrinsics/Arm/AdvSimd.PlatformNotSupported.cs b/.../System.Private.CoreLib/src/System/Runtime/Intrinsics/Arm/AdvSimd.PlatformNotSupported.cs
@@ -2258,6 +2258,7 @@ internal Arm64() { }
             ///   A64: LD4 { Vn.2D, Vn+1.2D, Vn+2.2D, Vn+3.2D }, [Xn]
             /// </summary>
             public static unsafe (Vector128<double> Value1, Vector128<double> Value2, Vector128<double> Value3, Vector128<double> Value4) LoadVector128x4AndUnzip(double* address) { throw new PlatformNotSupportedException(); }
+#endif
 
             /// <summary>
             ///   A64: LD1 { Vn.16B, Vn+1.16B }, [Xn]
@@ -2408,7 +2409,6 @@ internal Arm64() { }
             ///   A64: LD1 { Vn.2D, Vn+1.2D, Vn+2.2D, Vn+3.2D }, [Xn]
             /// </summary>
             public static unsafe (Vector128<double> Value1, Vector128<double> Value2, Vector128<double> Value3, Vector128<double> Value4) LoadVector128x4(double* address) { throw new PlatformNotSupportedException(); }
-#endif
 
             /// <summary>
             /// float64x2_t vmaxq_f64 (float64x2_t a, float64x2_t b)
@@ -9632,9 +9632,6 @@ internal Arm64() { }
         /// </summary>
         public static unsafe Vector128<ulong> LoadVector128(ulong* address) { throw new PlatformNotSupportedException(); }
 
-#if false
-        // Should be disabled until Mono implements these APIs. See https://github.com/dotnet/runtime/issues/93081
-
         /// <summary>
         ///   A64: LD2 { Vn.8B, Vn+1.8B }, [Xn]
         /// </summary>
@@ -9844,7 +9841,6 @@ internal Arm64() { }
         ///   A64: LD1 { Vn.2S, Vn+1.2S, Vn+2.2S, Vn+3.2S }, [Xn]
         /// </summary>
         public static unsafe (Vector64<float> Value1, Vector64<float> Value2, Vector64<float> Value3, Vector64<float> Value4) LoadVector64x4(float* address) { throw new PlatformNotSupportedException(); }
-#endif
 
         /// <summary>
         /// uint8x8_t vmax_u8 (uint8x8_t a, uint8x8_t b)

diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Arm/AdvSimd.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Arm/AdvSimd.cs
@@ -2103,9 +2103,6 @@ internal Arm64() { }
             /// </summary>
             public static unsafe (Vector128<ulong> Value1, Vector128<ulong> Value2) LoadPairVector128NonTemporal(ulong* address) => LoadPairVector128NonTemporal(address);
 
-#if false
-            // Should be disabled until Mono implements these APIs. See https://github.com/dotnet/runtime/issues/93081
-
             /// <summary>
             ///   A64: LD2 { Vn.16B, Vn+1.16B }, [Xn]
             /// </summary>
@@ -2405,7 +2402,6 @@ internal Arm64() { }
             ///   A64: LD1 { Vn.2D, Vn+1.2D, Vn+2.2D, Vn+3.2D }, [Xn]
             /// </summary>
             public static unsafe (Vector128<double> Value1, Vector128<double> Value2, Vector128<double> Value3, Vector128<double> Value4) LoadVector128x4(double* address) => LoadVector128x4(address);
-#endif
 
             /// <summary>
             /// float64x2_t vmaxq_f64 (float64x2_t a, float64x2_t b)
@@ -9628,9 +9624,6 @@ internal Arm64() { }
         /// </summary>
         public static unsafe Vector128<ulong> LoadVector128(ulong* address) => LoadVector128(address);
 
-#if false
-        // Should be disabled until Mono implements these APIs. See https://github.com/dotnet/runtime/issues/93081
-
         /// <summary>
         ///   A64: LD2 { Vn.8B, Vn+1.8B }, [Xn]
         /// </summary>
@@ -9840,7 +9833,6 @@ internal Arm64() { }
         ///   A64: LD1 { Vn.2S, Vn+1.2S, Vn+2.2S, Vn+3.2S }, [Xn]
         /// </summary>
         public static unsafe (Vector64<float> Value1, Vector64<float> Value2, Vector64<float> Value3, Vector64<float> Value4) LoadVector64x4(float* address) => LoadVector64x4(address);
-#endif
 
         /// <summary>
         /// uint8x8_t vmax_u8 (uint8x8_t a, uint8x8_t b)

diff --git a/src/libraries/System.Runtime.Intrinsics/ref/System.Runtime.Intrinsics.cs b/src/libraries/System.Runtime.Intrinsics/ref/System.Runtime.Intrinsics.cs
@@ -1990,8 +1990,6 @@ internal AdvSimd() { }
         public static unsafe System.Runtime.Intrinsics.Vector64<ushort> LoadVector64(ushort* address) { throw null; }
         public static unsafe System.Runtime.Intrinsics.Vector64<uint> LoadVector64(uint* address) { throw null; }
         public static unsafe System.Runtime.Intrinsics.Vector64<ulong> LoadVector64(ulong* address) { throw null; }
-#if false
-        // Should be disabled until Mono implements these APIs. See https://github.com/dotnet/runtime/issues/93081
         public static unsafe (System.Runtime.Intrinsics.Vector64<byte> Value1, System.Runtime.Intrinsics.Vector64<byte> Value2) LoadVector64x2AndUnzip(byte* address) { throw null; }
         public static unsafe (System.Runtime.Intrinsics.Vector64<sbyte> Value1, System.Runtime.Intrinsics.Vector64<sbyte> Value2) LoadVector64x2AndUnzip(sbyte* address) { throw null; }
         public static unsafe (System.Runtime.Intrinsics.Vector64<short> Value1, System.Runtime.Intrinsics.Vector64<short> Value2) LoadVector64x2AndUnzip(short* address) { throw null; }
@@ -2034,7 +2032,6 @@ internal AdvSimd() { }
         public static unsafe (System.Runtime.Intrinsics.Vector64<int> Value1, System.Runtime.Intrinsics.Vector64<int> Value2, System.Runtime.Intrinsics.Vector64<int> Value3, System.Runtime.Intrinsics.Vector64<int> Value4) LoadVector64x4(int* address) { throw null; }
         public static unsafe (System.Runtime.Intrinsics.Vector64<uint> Value1, System.Runtime.Intrinsics.Vector64<uint> Value2, System.Runtime.Intrinsics.Vector64<uint> Value3, System.Runtime.Intrinsics.Vector64<uint> Value4) LoadVector64x4(uint* address) { throw null; }
         public static unsafe (System.Runtime.Intrinsics.Vector64<float> Value1, System.Runtime.Intrinsics.Vector64<float> Value2, System.Runtime.Intrinsics.Vector64<float> Value3, System.Runtime.Intrinsics.Vector64<float> Value4) LoadVector64x4(float* address) { throw null; }
-#endif
         public static System.Runtime.Intrinsics.Vector128<byte> Max(System.Runtime.Intrinsics.Vector128<byte> left, System.Runtime.Intrinsics.Vector128<byte> right) { throw null; }
         public static System.Runtime.Intrinsics.Vector128<short> Max(System.Runtime.Intrinsics.Vector128<short> left, System.Runtime.Intrinsics.Vector128<short> right) { throw null; }
         public static System.Runtime.Intrinsics.Vector128<int> Max(System.Runtime.Intrinsics.Vector128<int> left, System.Runtime.Intrinsics.Vector128<int> right) { throw null; }
@@ -3486,6 +3483,7 @@ internal Arm64() { }
             public static unsafe (System.Runtime.Intrinsics.Vector128<ulong> Value1, System.Runtime.Intrinsics.Vector128<ulong> Value2, System.Runtime.Intrinsics.Vector128<ulong> Value3, System.Runtime.Intrinsics.Vector128<ulong> Value4) LoadVector128x4AndUnzip(ulong* address) { throw null; }
             public static unsafe (System.Runtime.Intrinsics.Vector128<float> Value1, System.Runtime.Intrinsics.Vector128<float> Value2, System.Runtime.Intrinsics.Vector128<float> Value3, System.Runtime.Intrinsics.Vector128<float> Value4) LoadVector128x4AndUnzip(float* address) { throw null; }
             public static unsafe (System.Runtime.Intrinsics.Vector128<double> Value1, System.Runtime.Intrinsics.Vector128<double> Value2, System.Runtime.Intrinsics.Vector128<double> Value3, System.Runtime.Intrinsics.Vector128<double> Value4) LoadVector128x4AndUnzip(double* address) { throw null; }
+#endif
             public static unsafe (System.Runtime.Intrinsics.Vector128<byte> Value1, System.Runtime.Intrinsics.Vector128<byte> Value2) LoadVector128x2(byte* address) { throw null; }
             public static unsafe (System.Runtime.Intrinsics.Vector128<sbyte> Value1, System.Runtime.Intrinsics.Vector128<sbyte> Value2) LoadVector128x2(sbyte* address) { throw null; }
             public static unsafe (System.Runtime.Intrinsics.Vector128<short> Value1, System.Runtime.Intrinsics.Vector128<short> Value2) LoadVector128x2(short* address) { throw null; }
@@ -3516,7 +3514,6 @@ internal Arm64() { }
             public static unsafe (System.Runtime.Intrinsics.Vector128<ulong> Value1, System.Runtime.Intrinsics.Vector128<ulong> Value2, System.Runtime.Intrinsics.Vector128<ulong> Value3, System.Runtime.Intrinsics.Vector128<ulong> Value4) LoadVector128x4(ulong* address) { throw null; }        
             public static unsafe (System.Runtime.Intrinsics.Vector128<float> Value1, System.Runtime.Intrinsics.Vector128<float> Value2, System.Runtime.Intrinsics.Vector128<float> Value3, System.Runtime.Intrinsics.Vector128<float> Value4) LoadVector128x4(float* address) { throw null; }
             public static unsafe (System.Runtime.Intrinsics.Vector128<double> Value1, System.Runtime.Intrinsics.Vector128<double> Value2, System.Runtime.Intrinsics.Vector128<double> Value3, System.Runtime.Intrinsics.Vector128<double> Value4) LoadVector128x4(double* address) { throw null; }
-#endif
             public static System.Runtime.Intrinsics.Vector128<double> Max(System.Runtime.Intrinsics.Vector128<double> left, System.Runtime.Intrinsics.Vector128<double> right) { throw null; }
             public static System.Runtime.Intrinsics.Vector64<byte> MaxAcross(System.Runtime.Intrinsics.Vector128<byte> value) { throw null; }
             public static System.Runtime.Intrinsics.Vector64<short> MaxAcross(System.Runtime.Intrinsics.Vector128<short> value) { throw null; }

@@ -26,6 +26,7 @@
 #define WidenAcross INTRIN_kind_widen_across
 #define Across INTRIN_kind_across
 #define Arm64DotProd INTRIN_kind_arm64_dot_prod
+#define AddPointer INTRIN_kind_add_pointer
 #if !defined(Generic)
 #define Generic
 #endif
@@ -361,6 +362,18 @@ INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_SADDV, aarch64_neon_saddv, Arm64, Across,
 INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_UADDV, aarch64_neon_uaddv, Arm64, Across, V64 | V128 | I1 | I2 | I4 | I8)
 INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_FADDV, aarch64_neon_faddv, Arm64, Across, V64 | V128 | R4 | R8)
 
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD1X2_V64, aarch64_neon_ld1x2, Arm64, AddPointer, V64 | I1 | I2 | I4 | R4)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD1X3_V64, aarch64_neon_ld1x3, Arm64, AddPointer, V64 | I1 | I2 | I4 | R4)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD1X4_V64, aarch64_neon_ld1x4, Arm64, AddPointer, V64 | I1 | I2 | I4 | R4)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD1X2_V128, aarch64_neon_ld1x2, Arm64, AddPointer, V128 | I1 | I2 | I4 | I8 | R4 | R8)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD1X3_V128, aarch64_neon_ld1x3, Arm64, AddPointer, V128 | I1 | I2 | I4 | I8 | R4 | R8)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD1X4_V128, aarch64_neon_ld1x4, Arm64, AddPointer, V128 | I1 | I2 | I4 | I8 | R4 | R8)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD2_V64, aarch64_neon_ld2, Arm64, AddPointer, V64 | I1 | I2 | I4 | R4)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD3_V64, aarch64_neon_ld3, Arm64, AddPointer, V64 | I1 | I2 | I4 | R4)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD4_V64, aarch64_neon_ld4, Arm64, AddPointer, V64 | I1 | I2 | I4 | R4)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD2_V128, aarch64_neon_ld2, Arm64, AddPointer, V128 | I1 | I2 | I4 | I8 | R4 | R8)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD3_V128, aarch64_neon_ld3, Arm64, AddPointer, V128 | I1 | I2 | I4 | I8 | R4 | R8)
+INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_LD4_V128, aarch64_neon_ld4, Arm64, AddPointer, V128 | I1 | I2 | I4 | I8 | R4 | R8)
 INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_SMAXV, aarch64_neon_smaxv, Arm64, Across, V64 | V128 | I1 | I2 | I4)
 INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_UMAXV, aarch64_neon_umaxv, Arm64, Across, V64 | V128 | I1 | I2 | I4)
 INTRINS_OVR_TAG_KIND(AARCH64_ADV_SIMD_SMINV, aarch64_neon_sminv, Arm64, Across, V64 | V128 | I1 | I2 | I4)

@@ -726,7 +726,7 @@ mono_llvm_register_intrinsic (LLVMModuleRef module, IntrinsicId id, LLVMTypeRef
 }
 
 /*
- * mono_llvm_register_intrinsic:
+ * mono_llvm_register_overloaded_intrinsic:
  *
  *   Register an overloaded LLVM intrinsic identified by ID using the supplied types.
  */

@@ -394,7 +394,7 @@ static const llvm_ovr_tag_t intrin_arm64_ovr [] = {
 	#define INTRINS_OVR_2_ARG(sym, ...) 0,
 	#define INTRINS_OVR_3_ARG(sym, ...) 0,
 	#define INTRINS_OVR_TAG(sym, _, arch, spec) spec,
-	#define INTRINS_OVR_TAG_KIND(sym, _, kind, arch, spec) spec,
+	#define INTRINS_OVR_TAG_KIND(sym, _, arch, kind, spec) spec,
 	#include "llvm-intrinsics.h"
 };
 
@@ -404,6 +404,7 @@ enum {
 	INTRIN_kind_widen_across,
 	INTRIN_kind_across,
 	INTRIN_kind_arm64_dot_prod,
+	INTRIN_kind_add_pointer,
 };
 
 static const uint8_t intrin_kind [] = {
@@ -661,13 +662,14 @@ static LLVMTypeRef
 simd_valuetuple_to_llvm_type (EmitContext *ctx, MonoClass *klass)
 {
 	const char *klass_name = m_class_get_name (klass);
-	if (!strcmp (klass_name, "ValueTuple`2")) {
-		MonoType *etype = mono_class_get_generic_class (klass)->context.class_inst->type_argv [0];
+	if (strstr (klass_name, "ValueTuple") != NULL) {
+		MonoGenericInst *classInst = mono_class_get_generic_class (klass)->context.class_inst;
+		MonoType *etype = classInst->type_argv [0];
 		if (etype->type != MONO_TYPE_GENERICINST)
 			g_assert_not_reached ();
 		MonoClass *eklass = etype->data.generic_class->cached_class;
 		LLVMTypeRef ltype = simd_class_to_llvm_type (ctx, eklass);
-		return LLVMArrayType (ltype, 2);
+		return LLVMArrayType (ltype, classInst->type_argc);
 	}
 	g_assert_not_reached ();
 }
@@ -11616,6 +11618,21 @@ MONO_RESTORE_WARNING
 			values [ins->dreg] = vec_sz == 64 ? val : NULL;
 			break;
 		}
+		case OP_ARM64_LDM: {
+			const char *oname = "arm64_ldm";
+			LLVMTypeRef ret_t = simd_valuetuple_to_llvm_type (ctx, ins->klass);
+			if (!addresses [ins->dreg])
+				addresses [ins->dreg] = create_address (ctx->module, build_named_alloca (ctx, m_class_get_byval_arg (ins->klass), oname), ret_t);
+			LLVMTypeRef vec_t = LLVMGetElementType (ret_t);
+			IntrinsicId iid = (IntrinsicId) ins->inst_c0;
+			llvm_ovr_tag_t ovr_tag = ovr_tag_from_llvm_type (vec_t);
+			LLVMValueRef result = call_overloaded_intrins (ctx, iid, ovr_tag, &lhs, oname);
+			LLVMTypeRef retptr_t = pointer_type (ret_t);
+			LLVMValueRef dst = convert (ctx, addresses [ins->dreg]->value, retptr_t);
+			LLVMBuildStore (builder, result, dst);
+			values [ins->dreg] = result;
+			break;
+		}
 		case OP_ARM64_ST1: {
 			LLVMTypeRef t = LLVMTypeOf (rhs);
 			LLVMValueRef address = convert (ctx, lhs, pointer_type (t));
@@ -13546,6 +13563,10 @@ add_intrinsic (EmitContext *ctx, int id)
 						 */
 						 LLVMTypeRef associated_type = intrin_types [vw][0];
 						 intrins = add_intrins2 (module, id, distinguishing_type, associated_type, &intrins_type);
+					} else if (kind == INTRIN_kind_add_pointer) {
+						LLVMTypeRef elem_type = LLVMGetElementType (distinguishing_type);
+						LLVMTypeRef src_t = pointer_type (elem_type);
+						intrins = add_intrins2 (module, id, distinguishing_type, src_t, &intrins_type);
 					} else
 						intrins = add_intrins1 (module, id, distinguishing_type, &intrins_type);
 					int key = key_from_id_and_tag (id, test);

@@ -1642,6 +1642,8 @@ MINI_OP(OP_ARM64_LDNP_SCALAR, "arm64_ldnp_scalar", VREG, IREG, NONE)
 MINI_OP(OP_ARM64_LDP, "arm64_ldp", VREG, IREG, NONE)
 MINI_OP(OP_ARM64_LDP_SCALAR, "arm64_ldp_scalar", VREG, IREG, NONE)
 
+MINI_OP(OP_ARM64_LDM, "arm64_ldm", VREG, IREG, NONE)
+
 MINI_OP(OP_ARM64_ST1, "arm64_st1", NONE, IREG, XREG)
 MINI_OP(OP_ARM64_SXTL, "arm64_sxtl", XREG, XREG, NONE)
 MINI_OP(OP_ARM64_SXTL2, "arm64_sxtl2", XREG, XREG, NONE)

@@ -3658,7 +3658,19 @@ static SimdIntrinsic advsimd_methods [] = {
 	{SN_LoadPairVector64, OP_ARM64_LDP},
 	{SN_LoadPairVector64NonTemporal, OP_ARM64_LDNP},
 	{SN_LoadVector128, OP_ARM64_LD1},
+	{SN_LoadVector128x2, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD1X2_V128},
+	{SN_LoadVector128x2AndUnzip, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD2_V128},
+	{SN_LoadVector128x3, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD1X3_V128},
+	{SN_LoadVector128x3AndUnzip, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD3_V128},
+	{SN_LoadVector128x4, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD1X4_V128},
+	{SN_LoadVector128x4AndUnzip, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD4_V128},
 	{SN_LoadVector64, OP_ARM64_LD1},
+	{SN_LoadVector64x2, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD1X2_V64},
+	{SN_LoadVector64x2AndUnzip, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD2_V64},
+	{SN_LoadVector64x3, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD1X3_V64},
+	{SN_LoadVector64x3AndUnzip, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD3_V64},
+	{SN_LoadVector64x4, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD1X4_V64},
+	{SN_LoadVector64x4AndUnzip, OP_ARM64_LDM, INTRINS_AARCH64_ADV_SIMD_LD4_V64},
 	{SN_Max, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_SMAX, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_UMAX, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_FMAX},
 	{SN_MaxAcross, OP_ARM64_XHORIZ, INTRINS_AARCH64_ADV_SIMD_SMAXV, OP_ARM64_XHORIZ, INTRINS_AARCH64_ADV_SIMD_UMAXV, OP_ARM64_XHORIZ, INTRINS_AARCH64_ADV_SIMD_FMAXV},
 	{SN_MaxNumber, OP_XOP_OVR_X_X_X, INTRINS_AARCH64_ADV_SIMD_FMAXNM},

@@ -165,6 +165,12 @@ METHOD(Extract)
 METHOD(LoadHigh)
 METHOD(LoadLow)
 METHOD(LoadVector128)
+METHOD(LoadVector128x2)
+METHOD(LoadVector128x3)
+METHOD(LoadVector128x4)
+METHOD(LoadVector128x2AndUnzip)
+METHOD(LoadVector128x3AndUnzip)
+METHOD(LoadVector128x4AndUnzip)
 METHOD(LoadScalarVector128)
 METHOD(MoveHighToLow)
 METHOD(MoveLowToHigh)
@@ -436,6 +442,12 @@ METHOD(LoadPairVector128NonTemporal)
 METHOD(LoadPairVector64)
 METHOD(LoadPairVector64NonTemporal)
 METHOD(LoadVector64)
+METHOD(LoadVector64x2)
+METHOD(LoadVector64x3)
+METHOD(LoadVector64x4)
+METHOD(LoadVector64x2AndUnzip)
+METHOD(LoadVector64x3AndUnzip)
+METHOD(LoadVector64x4AndUnzip)
 METHOD(MaxAcross)
 METHOD(MaxNumber)
 METHOD(MaxNumberAcross)