build for armv7

Tencent · Sep 20, 2024 · 86b03df · 86b03df
1 parent dd6bf5c
commit 86b03df
Show file tree

Hide file tree

Showing 2 changed files with 228 additions and 0 deletions.
diff --git a/src/layer/arm/gemm_int8.h b/src/layer/arm/gemm_int8.h
@@ -2111,6 +2111,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
                 float32x4_t _pe = vld1q_f32(p0 + A_hstep * 7);
                 float32x4_t _pf = vld1q_f32(p0 + A_hstep * 7 + 4);
 
+#if __aarch64__
                 _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                 _p1 = vmulq_laneq_f32(_p1, _scale0, 0);
                 _p2 = vmulq_laneq_f32(_p2, _scale0, 1);
@@ -2127,6 +2128,24 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
                 _pd = vmulq_laneq_f32(_pd, _scale1, 2);
                 _pe = vmulq_laneq_f32(_pe, _scale1, 3);
                 _pf = vmulq_laneq_f32(_pf, _scale1, 3);
+#else
+                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 0);
+                _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale0), 1);
+                _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale0), 1);
+                _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale0), 0);
+                _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale0), 0);
+                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale0), 1);
+                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale0), 1);
+                _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale1), 0);
+                _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale1), 0);
+                _pa = vmulq_lane_f32(_pa, vget_low_f32(_scale1), 1);
+                _pb = vmulq_lane_f32(_pb, vget_low_f32(_scale1), 1);
+                _pc = vmulq_lane_f32(_pc, vget_high_f32(_scale1), 0);
+                _pd = vmulq_lane_f32(_pd, vget_high_f32(_scale1), 0);
+                _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 1);
+                _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -2190,6 +2209,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
                 float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 6);
                 float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 7);
 
+#if __aarch64__
                 _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                 _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                 _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -2198,6 +2218,16 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
                 _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                 _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                 _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
+#else
+                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
                 int8x8_t _r0 = float2int8(_p0, _p1);
@@ -2428,6 +2458,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
                 float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 3);
                 float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 3 + 4);
 
+#if __aarch64__
                 _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                 _p1 = vmulq_laneq_f32(_p1, _scale, 0);
                 _p2 = vmulq_laneq_f32(_p2, _scale, 1);
@@ -2436,6 +2467,16 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
                 _p5 = vmulq_laneq_f32(_p5, _scale, 2);
                 _p6 = vmulq_laneq_f32(_p6, _scale, 3);
                 _p7 = vmulq_laneq_f32(_p7, _scale, 3);
+#else
+                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 0);
+                _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale), 1);
+                _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale), 1);
+                _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale), 0);
+                _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale), 0);
+                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 1);
+                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -2475,10 +2516,17 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
                 float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 2);
                 float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 3);
 
+#if __aarch64__
                 _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                 _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                 _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                 _p3 = vmulq_laneq_f32(_p3, _scale, 3);
+#else
+                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
                 int8x8_t _r0 = float2int8(_p0, _p1);
@@ -3032,6 +3080,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
                 float32x4_t _pe = vld1q_f32(p0 + A_hstep * 4 + 24);
                 float32x4_t _pf = vld1q_f32(p0 + A_hstep * 4 + 28);
 
+#if __aarch64__
                 _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                 _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                 _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -3048,6 +3097,24 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
                 _pd = vmulq_laneq_f32(_pd, _scale1, 1);
                 _pe = vmulq_laneq_f32(_pe, _scale1, 2);
                 _pf = vmulq_laneq_f32(_pf, _scale1, 3);
+#else
+                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+                _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale0), 0);
+                _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale0), 1);
+                _pa = vmulq_lane_f32(_pa, vget_high_f32(_scale0), 0);
+                _pb = vmulq_lane_f32(_pb, vget_high_f32(_scale0), 1);
+                _pc = vmulq_lane_f32(_pc, vget_low_f32(_scale1), 0);
+                _pd = vmulq_lane_f32(_pd, vget_low_f32(_scale1), 1);
+                _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 0);
+                _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -3116,6 +3183,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
                 float32x4_t _p6 = vld1q_f32(p0 + 24);
                 float32x4_t _p7 = vld1q_f32(p0 + 28);
 
+#if __aarch64__
                 _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
                 _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
                 _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -3124,6 +3192,16 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
                 _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
                 _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
                 _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
+#else
+                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+#endif
 
                 int8x8_t _r0 = float2int8(_p0, _p1);
                 int8x8_t _r1 = float2int8(_p2, _p3);
@@ -3334,6 +3412,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
                 float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 4 + 8);
                 float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 4 + 12);
 
+#if __aarch64__
                 _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                 _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                 _p2 = vmulq_laneq_f32(_p2, _scale, 2);
@@ -3342,6 +3421,16 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
                 _p5 = vmulq_laneq_f32(_p5, _scale, 1);
                 _p6 = vmulq_laneq_f32(_p6, _scale, 2);
                 _p7 = vmulq_laneq_f32(_p7, _scale, 3);
+#else
+                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+                _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale), 0);
+                _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale), 1);
+                _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 0);
+                _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -3381,10 +3470,17 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
                 float32x4_t _p2 = vld1q_f32(p0 + 8);
                 float32x4_t _p3 = vld1q_f32(p0 + 12);
 
+#if __aarch64__
                 _p0 = vmulq_laneq_f32(_p0, _scale, 0);
                 _p1 = vmulq_laneq_f32(_p1, _scale, 1);
                 _p2 = vmulq_laneq_f32(_p2, _scale, 2);
                 _p3 = vmulq_laneq_f32(_p3, _scale, 3);
+#else
+                _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+                _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+                _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+                _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
                 int8x8_t _r0 = float2int8(_p0, _p1);
@@ -8910,6 +9006,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
                     }
                     if (broadcast_type_C == 1 || broadcast_type_C == 2)
                     {
+#if __aarch64__
                         float32x4_t _cc0 = vdupq_laneq_f32(_c0, 0);
                         float32x4_t _cc1 = vdupq_laneq_f32(_c0, 1);
                         float32x4_t _cc2 = vdupq_laneq_f32(_c0, 2);
@@ -8918,6 +9015,16 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
                         float32x4_t _cc5 = vdupq_laneq_f32(_c1, 1);
                         float32x4_t _cc6 = vdupq_laneq_f32(_c1, 2);
                         float32x4_t _cc7 = vdupq_laneq_f32(_c1, 3);
+#else
+                        float32x4_t _cc0 = vdupq_lane_f32(vget_low_f32(_c0), 0);
+                        float32x4_t _cc1 = vdupq_lane_f32(vget_low_f32(_c0), 1);
+                        float32x4_t _cc2 = vdupq_lane_f32(vget_high_f32(_c0), 0);
+                        float32x4_t _cc3 = vdupq_lane_f32(vget_high_f32(_c0), 1);
+                        float32x4_t _cc4 = vdupq_lane_f32(vget_low_f32(_c1), 0);
+                        float32x4_t _cc5 = vdupq_lane_f32(vget_low_f32(_c1), 1);
+                        float32x4_t _cc6 = vdupq_lane_f32(vget_high_f32(_c1), 0);
+                        float32x4_t _cc7 = vdupq_lane_f32(vget_high_f32(_c1), 1);
+#endif
                         _f0 = vaddq_f32(_f0, _cc0);
                         _f1 = vaddq_f32(_f1, _cc1);
                         _f2 = vaddq_f32(_f2, _cc2);
@@ -9993,10 +10100,17 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
                     }
                     if (broadcast_type_C == 1 || broadcast_type_C == 2)
                     {
+#if __aarch64__
                         float32x4_t _cc0 = vdupq_laneq_f32(_c0, 0);
                         float32x4_t _cc1 = vdupq_laneq_f32(_c0, 1);
                         float32x4_t _cc2 = vdupq_laneq_f32(_c0, 2);
                         float32x4_t _cc3 = vdupq_laneq_f32(_c0, 3);
+#else
+                        float32x4_t _cc0 = vdupq_lane_f32(vget_low_f32(_c0), 0);
+                        float32x4_t _cc1 = vdupq_lane_f32(vget_low_f32(_c0), 1);
+                        float32x4_t _cc2 = vdupq_lane_f32(vget_high_f32(_c0), 0);
+                        float32x4_t _cc3 = vdupq_lane_f32(vget_high_f32(_c0), 1);
+#endif
                         _f0 = vaddq_f32(_f0, _cc0);
                         _f1 = vaddq_f32(_f1, _cc1);
                         _f2 = vaddq_f32(_f2, _cc2);