Skip to content

Commit

Permalink
build for armv7
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed Sep 20, 2024
1 parent dd6bf5c commit 86b03df
Show file tree
Hide file tree
Showing 2 changed files with 228 additions and 0 deletions.
114 changes: 114 additions & 0 deletions src/layer/arm/gemm_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -2111,6 +2111,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
float32x4_t _pe = vld1q_f32(p0 + A_hstep * 7);
float32x4_t _pf = vld1q_f32(p0 + A_hstep * 7 + 4);

#if __aarch64__
_p0 = vmulq_laneq_f32(_p0, _scale0, 0);
_p1 = vmulq_laneq_f32(_p1, _scale0, 0);
_p2 = vmulq_laneq_f32(_p2, _scale0, 1);
Expand All @@ -2127,6 +2128,24 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
_pd = vmulq_laneq_f32(_pd, _scale1, 2);
_pe = vmulq_laneq_f32(_pe, _scale1, 3);
_pf = vmulq_laneq_f32(_pf, _scale1, 3);
#else
_p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
_p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 0);
_p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale0), 1);
_p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale0), 1);
_p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale0), 0);
_p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale0), 0);
_p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale0), 1);
_p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale0), 1);
_p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale1), 0);
_p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale1), 0);
_pa = vmulq_lane_f32(_pa, vget_low_f32(_scale1), 1);
_pb = vmulq_lane_f32(_pb, vget_low_f32(_scale1), 1);
_pc = vmulq_lane_f32(_pc, vget_high_f32(_scale1), 0);
_pd = vmulq_lane_f32(_pd, vget_high_f32(_scale1), 0);
_pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 1);
_pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
Expand Down Expand Up @@ -2190,6 +2209,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 6);
float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 7);

#if __aarch64__
_p0 = vmulq_laneq_f32(_p0, _scale0, 0);
_p1 = vmulq_laneq_f32(_p1, _scale0, 1);
_p2 = vmulq_laneq_f32(_p2, _scale0, 2);
Expand All @@ -2198,6 +2218,16 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
_p5 = vmulq_laneq_f32(_p5, _scale1, 1);
_p6 = vmulq_laneq_f32(_p6, _scale1, 2);
_p7 = vmulq_laneq_f32(_p7, _scale1, 3);
#else
_p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
_p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
_p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
_p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
_p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
_p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
_p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
_p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
int8x8_t _r0 = float2int8(_p0, _p1);
Expand Down Expand Up @@ -2428,6 +2458,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 3);
float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 3 + 4);

#if __aarch64__
_p0 = vmulq_laneq_f32(_p0, _scale, 0);
_p1 = vmulq_laneq_f32(_p1, _scale, 0);
_p2 = vmulq_laneq_f32(_p2, _scale, 1);
Expand All @@ -2436,6 +2467,16 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
_p5 = vmulq_laneq_f32(_p5, _scale, 2);
_p6 = vmulq_laneq_f32(_p6, _scale, 3);
_p7 = vmulq_laneq_f32(_p7, _scale, 3);
#else
_p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
_p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 0);
_p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale), 1);
_p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale), 1);
_p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale), 0);
_p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale), 0);
_p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 1);
_p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
Expand Down Expand Up @@ -2475,10 +2516,17 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 2);
float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 3);

#if __aarch64__
_p0 = vmulq_laneq_f32(_p0, _scale, 0);
_p1 = vmulq_laneq_f32(_p1, _scale, 1);
_p2 = vmulq_laneq_f32(_p2, _scale, 2);
_p3 = vmulq_laneq_f32(_p3, _scale, 3);
#else
_p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
_p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
_p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
_p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
int8x8_t _r0 = float2int8(_p0, _p1);
Expand Down Expand Up @@ -3032,6 +3080,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
float32x4_t _pe = vld1q_f32(p0 + A_hstep * 4 + 24);
float32x4_t _pf = vld1q_f32(p0 + A_hstep * 4 + 28);

#if __aarch64__
_p0 = vmulq_laneq_f32(_p0, _scale0, 0);
_p1 = vmulq_laneq_f32(_p1, _scale0, 1);
_p2 = vmulq_laneq_f32(_p2, _scale0, 2);
Expand All @@ -3048,6 +3097,24 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
_pd = vmulq_laneq_f32(_pd, _scale1, 1);
_pe = vmulq_laneq_f32(_pe, _scale1, 2);
_pf = vmulq_laneq_f32(_pf, _scale1, 3);
#else
_p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
_p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
_p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
_p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
_p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
_p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
_p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
_p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
_p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale0), 0);
_p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale0), 1);
_pa = vmulq_lane_f32(_pa, vget_high_f32(_scale0), 0);
_pb = vmulq_lane_f32(_pb, vget_high_f32(_scale0), 1);
_pc = vmulq_lane_f32(_pc, vget_low_f32(_scale1), 0);
_pd = vmulq_lane_f32(_pd, vget_low_f32(_scale1), 1);
_pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 0);
_pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
Expand Down Expand Up @@ -3116,6 +3183,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
float32x4_t _p6 = vld1q_f32(p0 + 24);
float32x4_t _p7 = vld1q_f32(p0 + 28);

#if __aarch64__
_p0 = vmulq_laneq_f32(_p0, _scale0, 0);
_p1 = vmulq_laneq_f32(_p1, _scale0, 1);
_p2 = vmulq_laneq_f32(_p2, _scale0, 2);
Expand All @@ -3124,6 +3192,16 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
_p5 = vmulq_laneq_f32(_p5, _scale1, 1);
_p6 = vmulq_laneq_f32(_p6, _scale1, 2);
_p7 = vmulq_laneq_f32(_p7, _scale1, 3);
#else
_p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
_p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
_p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
_p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
_p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
_p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
_p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
_p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
#endif

int8x8_t _r0 = float2int8(_p0, _p1);
int8x8_t _r1 = float2int8(_p2, _p3);
Expand Down Expand Up @@ -3334,6 +3412,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 4 + 8);
float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 4 + 12);

#if __aarch64__
_p0 = vmulq_laneq_f32(_p0, _scale, 0);
_p1 = vmulq_laneq_f32(_p1, _scale, 1);
_p2 = vmulq_laneq_f32(_p2, _scale, 2);
Expand All @@ -3342,6 +3421,16 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
_p5 = vmulq_laneq_f32(_p5, _scale, 1);
_p6 = vmulq_laneq_f32(_p6, _scale, 2);
_p7 = vmulq_laneq_f32(_p7, _scale, 3);
#else
_p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
_p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
_p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
_p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
_p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale), 0);
_p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale), 1);
_p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 0);
_p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
#if __ARM_FEATURE_MATMUL_INT8
Expand Down Expand Up @@ -3381,10 +3470,17 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
float32x4_t _p2 = vld1q_f32(p0 + 8);
float32x4_t _p3 = vld1q_f32(p0 + 12);

#if __aarch64__
_p0 = vmulq_laneq_f32(_p0, _scale, 0);
_p1 = vmulq_laneq_f32(_p1, _scale, 1);
_p2 = vmulq_laneq_f32(_p2, _scale, 2);
_p3 = vmulq_laneq_f32(_p3, _scale, 3);
#else
_p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
_p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
_p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
_p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
#endif

#if __ARM_FEATURE_DOTPROD
int8x8_t _r0 = float2int8(_p0, _p1);
Expand Down Expand Up @@ -8910,6 +9006,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
}
if (broadcast_type_C == 1 || broadcast_type_C == 2)
{
#if __aarch64__
float32x4_t _cc0 = vdupq_laneq_f32(_c0, 0);
float32x4_t _cc1 = vdupq_laneq_f32(_c0, 1);
float32x4_t _cc2 = vdupq_laneq_f32(_c0, 2);
Expand All @@ -8918,6 +9015,16 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
float32x4_t _cc5 = vdupq_laneq_f32(_c1, 1);
float32x4_t _cc6 = vdupq_laneq_f32(_c1, 2);
float32x4_t _cc7 = vdupq_laneq_f32(_c1, 3);
#else
float32x4_t _cc0 = vdupq_lane_f32(vget_low_f32(_c0), 0);
float32x4_t _cc1 = vdupq_lane_f32(vget_low_f32(_c0), 1);
float32x4_t _cc2 = vdupq_lane_f32(vget_high_f32(_c0), 0);
float32x4_t _cc3 = vdupq_lane_f32(vget_high_f32(_c0), 1);
float32x4_t _cc4 = vdupq_lane_f32(vget_low_f32(_c1), 0);
float32x4_t _cc5 = vdupq_lane_f32(vget_low_f32(_c1), 1);
float32x4_t _cc6 = vdupq_lane_f32(vget_high_f32(_c1), 0);
float32x4_t _cc7 = vdupq_lane_f32(vget_high_f32(_c1), 1);
#endif
_f0 = vaddq_f32(_f0, _cc0);
_f1 = vaddq_f32(_f1, _cc1);
_f2 = vaddq_f32(_f2, _cc2);
Expand Down Expand Up @@ -9993,10 +10100,17 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
}
if (broadcast_type_C == 1 || broadcast_type_C == 2)
{
#if __aarch64__
float32x4_t _cc0 = vdupq_laneq_f32(_c0, 0);
float32x4_t _cc1 = vdupq_laneq_f32(_c0, 1);
float32x4_t _cc2 = vdupq_laneq_f32(_c0, 2);
float32x4_t _cc3 = vdupq_laneq_f32(_c0, 3);
#else
float32x4_t _cc0 = vdupq_lane_f32(vget_low_f32(_c0), 0);
float32x4_t _cc1 = vdupq_lane_f32(vget_low_f32(_c0), 1);
float32x4_t _cc2 = vdupq_lane_f32(vget_high_f32(_c0), 0);
float32x4_t _cc3 = vdupq_lane_f32(vget_high_f32(_c0), 1);
#endif
_f0 = vaddq_f32(_f0, _cc0);
_f1 = vaddq_f32(_f1, _cc1);
_f2 = vaddq_f32(_f2, _cc2);
Expand Down
Loading

0 comments on commit 86b03df

Please sign in to comment.