From 86b03dfe4ae21673ff4fd1996e50a902edb2986d Mon Sep 17 00:00:00 2001
From: nihui
Date: Fri, 20 Sep 2024 10:28:23 +0800
Subject: [PATCH] build for armv7

---
 src/layer/arm/gemm_int8.h       | 114 ++++++++++++++++++++++++++++++++
 src/layer/arm/gemm_int8_bf16s.h | 114 ++++++++++++++++++++++++++++++++
 2 files changed, 228 insertions(+)

diff --git a/src/layer/arm/gemm_int8.h b/src/layer/arm/gemm_int8.h
index a532f7cf598..12d6d51b7ef 100644
--- a/src/layer/arm/gemm_int8.h
+++ b/src/layer/arm/gemm_int8.h
@@ -2111,6 +2111,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             float32x4_t _pe = vld1q_f32(p0 + A_hstep * 7);
             float32x4_t _pf = vld1q_f32(p0 + A_hstep * 7 + 4);
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale0, 0);
             _p2 = vmulq_laneq_f32(_p2, _scale0, 1);
@@ -2127,6 +2128,24 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             _pd = vmulq_laneq_f32(_pd, _scale1, 2);
             _pe = vmulq_laneq_f32(_pe, _scale1, 3);
             _pf = vmulq_laneq_f32(_pf, _scale1, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 0);
+            _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale0), 1);
+            _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale0), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale0), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale0), 0);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale0), 1);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale0), 1);
+            _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale1), 0);
+            _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale1), 0);
+            _pa = vmulq_lane_f32(_pa, vget_low_f32(_scale1), 1);
+            _pb = vmulq_lane_f32(_pb, vget_low_f32(_scale1), 1);
+            _pc = vmulq_lane_f32(_pc, vget_high_f32(_scale1), 0);
+            _pd = vmulq_lane_f32(_pd, vget_high_f32(_scale1), 0);
+            _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 1);
+            _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -2190,6 +2209,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 6);
             float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 7);
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -2198,6 +2218,16 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
             _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
             _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
             int8x8_t _r0 = float2int8(_p0, _p1);
@@ -2428,6 +2458,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 3);
             float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 3 + 4);
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale, 0);
             _p2 = vmulq_laneq_f32(_p2, _scale, 1);
@@ -2436,6 +2467,16 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             _p5 = vmulq_laneq_f32(_p5, _scale, 2);
             _p6 = vmulq_laneq_f32(_p6, _scale, 3);
             _p7 = vmulq_laneq_f32(_p7, _scale, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 0);
+            _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale), 1);
+            _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale), 0);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 1);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -2475,10 +2516,17 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             float32x4_t _p2 = vld1q_f32(p0 + A_hstep * 2);
             float32x4_t _p3 = vld1q_f32(p0 + A_hstep * 3);
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale, 2);
             _p3 = vmulq_laneq_f32(_p3, _scale, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
             int8x8_t _r0 = float2int8(_p0, _p1);
@@ -3032,6 +3080,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
             float32x4_t _pe = vld1q_f32(p0 + A_hstep * 4 + 24);
             float32x4_t _pf = vld1q_f32(p0 + A_hstep * 4 + 28);
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -3048,6 +3097,24 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
             _pd = vmulq_laneq_f32(_pd, _scale1, 1);
             _pe = vmulq_laneq_f32(_pe, _scale1, 2);
             _pf = vmulq_laneq_f32(_pf, _scale1, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+            _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale0), 0);
+            _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale0), 1);
+            _pa = vmulq_lane_f32(_pa, vget_high_f32(_scale0), 0);
+            _pb = vmulq_lane_f32(_pb, vget_high_f32(_scale0), 1);
+            _pc = vmulq_lane_f32(_pc, vget_low_f32(_scale1), 0);
+            _pd = vmulq_lane_f32(_pd, vget_low_f32(_scale1), 1);
+            _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 0);
+            _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -3116,6 +3183,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
             float32x4_t _p6 = vld1q_f32(p0 + 24);
             float32x4_t _p7 = vld1q_f32(p0 + 28);
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -3124,6 +3192,16 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
             _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
             _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
             _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+#endif
 
             int8x8_t _r0 = float2int8(_p0, _p1);
             int8x8_t _r1 = float2int8(_p2, _p3);
@@ -3334,6 +3412,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
             float32x4_t _p6 = vld1q_f32(p0 + A_hstep * 4 + 8);
             float32x4_t _p7 = vld1q_f32(p0 + A_hstep * 4 + 12);
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale, 2);
@@ -3342,6 +3421,16 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
             _p5 = vmulq_laneq_f32(_p5, _scale, 1);
             _p6 = vmulq_laneq_f32(_p6, _scale, 2);
             _p7 = vmulq_laneq_f32(_p7, _scale, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale), 1);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 0);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -3381,10 +3470,17 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
             float32x4_t _p2 = vld1q_f32(p0 + 8);
             float32x4_t _p3 = vld1q_f32(p0 + 12);
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale, 2);
             _p3 = vmulq_laneq_f32(_p3, _scale, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
             int8x8_t _r0 = float2int8(_p0, _p1);
@@ -8910,6 +9006,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
             }
             if (broadcast_type_C == 1 || broadcast_type_C == 2)
             {
+#if __aarch64__
                 float32x4_t _cc0 = vdupq_laneq_f32(_c0, 0);
                 float32x4_t _cc1 = vdupq_laneq_f32(_c0, 1);
                 float32x4_t _cc2 = vdupq_laneq_f32(_c0, 2);
@@ -8918,6 +9015,16 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
                 float32x4_t _cc5 = vdupq_laneq_f32(_c1, 1);
                 float32x4_t _cc6 = vdupq_laneq_f32(_c1, 2);
                 float32x4_t _cc7 = vdupq_laneq_f32(_c1, 3);
+#else
+                float32x4_t _cc0 = vdupq_lane_f32(vget_low_f32(_c0), 0);
+                float32x4_t _cc1 = vdupq_lane_f32(vget_low_f32(_c0), 1);
+                float32x4_t _cc2 = vdupq_lane_f32(vget_high_f32(_c0), 0);
+                float32x4_t _cc3 = vdupq_lane_f32(vget_high_f32(_c0), 1);
+                float32x4_t _cc4 = vdupq_lane_f32(vget_low_f32(_c1), 0);
+                float32x4_t _cc5 = vdupq_lane_f32(vget_low_f32(_c1), 1);
+                float32x4_t _cc6 = vdupq_lane_f32(vget_high_f32(_c1), 0);
+                float32x4_t _cc7 = vdupq_lane_f32(vget_high_f32(_c1), 1);
+#endif
                 _f0 = vaddq_f32(_f0, _cc0);
                 _f1 = vaddq_f32(_f1, _cc1);
                 _f2 = vaddq_f32(_f2, _cc2);
@@ -9993,10 +10100,17 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
             }
             if (broadcast_type_C == 1 || broadcast_type_C == 2)
             {
+#if __aarch64__
                 float32x4_t _cc0 = vdupq_laneq_f32(_c0, 0);
                 float32x4_t _cc1 = vdupq_laneq_f32(_c0, 1);
                 float32x4_t _cc2 = vdupq_laneq_f32(_c0, 2);
                 float32x4_t _cc3 = vdupq_laneq_f32(_c0, 3);
+#else
+                float32x4_t _cc0 = vdupq_lane_f32(vget_low_f32(_c0), 0);
+                float32x4_t _cc1 = vdupq_lane_f32(vget_low_f32(_c0), 1);
+                float32x4_t _cc2 = vdupq_lane_f32(vget_high_f32(_c0), 0);
+                float32x4_t _cc3 = vdupq_lane_f32(vget_high_f32(_c0), 1);
+#endif
                 _f0 = vaddq_f32(_f0, _cc0);
                 _f1 = vaddq_f32(_f1, _cc1);
                 _f2 = vaddq_f32(_f2, _cc2);
diff --git a/src/layer/arm/gemm_int8_bf16s.h b/src/layer/arm/gemm_int8_bf16s.h
index b6ce8b41764..9cecdc89298 100644
--- a/src/layer/arm/gemm_int8_bf16s.h
+++ b/src/layer/arm/gemm_int8_bf16s.h
@@ -452,6 +452,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             float32x4_t _pe = bfloat2float(vget_low_u16(_w));
             float32x4_t _pf = bfloat2float(vget_high_u16(_w));
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale0, 0);
             _p2 = vmulq_laneq_f32(_p2, _scale0, 1);
@@ -468,6 +469,24 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             _pd = vmulq_laneq_f32(_pd, _scale1, 2);
             _pe = vmulq_laneq_f32(_pe, _scale1, 3);
             _pf = vmulq_laneq_f32(_pf, _scale1, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 0);
+            _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale0), 1);
+            _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale0), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale0), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale0), 0);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale0), 1);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale0), 1);
+            _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale1), 0);
+            _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale1), 0);
+            _pa = vmulq_lane_f32(_pa, vget_low_f32(_scale1), 1);
+            _pb = vmulq_lane_f32(_pb, vget_low_f32(_scale1), 1);
+            _pc = vmulq_lane_f32(_pc, vget_high_f32(_scale1), 0);
+            _pd = vmulq_lane_f32(_pd, vget_high_f32(_scale1), 0);
+            _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 1);
+            _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -531,6 +550,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             float32x4_t _p6 = bfloat2float(vld1_u16(p0 + A_hstep * 6));
             float32x4_t _p7 = bfloat2float(vld1_u16(p0 + A_hstep * 7));
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -539,6 +559,16 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
             _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
             _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
             int8x8_t _r0 = float2int8(_p0, _p1);
@@ -789,6 +819,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
             float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale, 0);
             _p2 = vmulq_laneq_f32(_p2, _scale, 1);
@@ -797,6 +828,16 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             _p5 = vmulq_laneq_f32(_p5, _scale, 2);
             _p6 = vmulq_laneq_f32(_p6, _scale, 3);
             _p7 = vmulq_laneq_f32(_p7, _scale, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 0);
+            _p2 = vmulq_lane_f32(_p2, vget_low_f32(_scale), 1);
+            _p3 = vmulq_lane_f32(_p3, vget_low_f32(_scale), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_high_f32(_scale), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_high_f32(_scale), 0);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 1);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -836,10 +877,17 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
             float32x4_t _p2 = bfloat2float(vld1_u16(p0 + A_hstep * 2));
             float32x4_t _p3 = bfloat2float(vld1_u16(p0 + A_hstep * 3));
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale, 2);
             _p3 = vmulq_laneq_f32(_p3, _scale, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
             int8x8_t _r0 = float2int8(_p0, _p1);
@@ -1347,6 +1395,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
             float32x4_t _pe = bfloat2float(vget_low_u16(_w));
             float32x4_t _pf = bfloat2float(vget_high_u16(_w));
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -1363,6 +1412,24 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
             _pd = vmulq_laneq_f32(_pd, _scale1, 1);
             _pe = vmulq_laneq_f32(_pe, _scale1, 2);
             _pf = vmulq_laneq_f32(_pf, _scale1, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+            _p8 = vmulq_lane_f32(_p8, vget_low_f32(_scale0), 0);
+            _p9 = vmulq_lane_f32(_p9, vget_low_f32(_scale0), 1);
+            _pa = vmulq_lane_f32(_pa, vget_high_f32(_scale0), 0);
+            _pb = vmulq_lane_f32(_pb, vget_high_f32(_scale0), 1);
+            _pc = vmulq_lane_f32(_pc, vget_low_f32(_scale1), 0);
+            _pd = vmulq_lane_f32(_pd, vget_low_f32(_scale1), 1);
+            _pe = vmulq_lane_f32(_pe, vget_high_f32(_scale1), 0);
+            _pf = vmulq_lane_f32(_pf, vget_high_f32(_scale1), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -1435,6 +1502,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
             float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
             float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale0, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale0, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale0, 2);
@@ -1443,6 +1511,16 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
             _p5 = vmulq_laneq_f32(_p5, _scale1, 1);
             _p6 = vmulq_laneq_f32(_p6, _scale1, 2);
             _p7 = vmulq_laneq_f32(_p7, _scale1, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale0), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale0), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale0), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale0), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale1), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale1), 1);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale1), 0);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale1), 1);
+#endif
 
             int8x8_t _r0 = float2int8(_p0, _p1);
             int8x8_t _r1 = float2int8(_p2, _p3);
@@ -1673,6 +1751,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
             float32x4_t _p6 = bfloat2float(vget_low_u16(_s));
             float32x4_t _p7 = bfloat2float(vget_high_u16(_s));
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale, 2);
@@ -1681,6 +1760,16 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
             _p5 = vmulq_laneq_f32(_p5, _scale, 1);
             _p6 = vmulq_laneq_f32(_p6, _scale, 2);
             _p7 = vmulq_laneq_f32(_p7, _scale, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+            _p4 = vmulq_lane_f32(_p4, vget_low_f32(_scale), 0);
+            _p5 = vmulq_lane_f32(_p5, vget_low_f32(_scale), 1);
+            _p6 = vmulq_lane_f32(_p6, vget_high_f32(_scale), 0);
+            _p7 = vmulq_lane_f32(_p7, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
 #if __ARM_FEATURE_MATMUL_INT8
@@ -1722,10 +1811,17 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
             float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
             float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
 
+#if __aarch64__
             _p0 = vmulq_laneq_f32(_p0, _scale, 0);
             _p1 = vmulq_laneq_f32(_p1, _scale, 1);
             _p2 = vmulq_laneq_f32(_p2, _scale, 2);
             _p3 = vmulq_laneq_f32(_p3, _scale, 3);
+#else
+            _p0 = vmulq_lane_f32(_p0, vget_low_f32(_scale), 0);
+            _p1 = vmulq_lane_f32(_p1, vget_low_f32(_scale), 1);
+            _p2 = vmulq_lane_f32(_p2, vget_high_f32(_scale), 0);
+            _p3 = vmulq_lane_f32(_p3, vget_high_f32(_scale), 1);
+#endif
 
 #if __ARM_FEATURE_DOTPROD
             int8x8_t _r0 = float2int8(_p0, _p1);
@@ -7508,6 +7604,7 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
             }
             if (broadcast_type_C == 1 || broadcast_type_C == 2)
             {
+#if __aarch64__
                 float32x4_t _cc0 = vdupq_laneq_f32(_c0, 0);
                 float32x4_t _cc1 = vdupq_laneq_f32(_c0, 1);
                 float32x4_t _cc2 = vdupq_laneq_f32(_c0, 2);
@@ -7516,6 +7613,16 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
                 float32x4_t _cc5 = vdupq_laneq_f32(_c1, 1);
                 float32x4_t _cc6 = vdupq_laneq_f32(_c1, 2);
                 float32x4_t _cc7 = vdupq_laneq_f32(_c1, 3);
+#else
+                float32x4_t _cc0 = vdupq_lane_f32(vget_low_f32(_c0), 0);
+                float32x4_t _cc1 = vdupq_lane_f32(vget_low_f32(_c0), 1);
+                float32x4_t _cc2 = vdupq_lane_f32(vget_high_f32(_c0), 0);
+                float32x4_t _cc3 = vdupq_lane_f32(vget_high_f32(_c0), 1);
+                float32x4_t _cc4 = vdupq_lane_f32(vget_low_f32(_c1), 0);
+                float32x4_t _cc5 = vdupq_lane_f32(vget_low_f32(_c1), 1);
+                float32x4_t _cc6 = vdupq_lane_f32(vget_high_f32(_c1), 0);
+                float32x4_t _cc7 = vdupq_lane_f32(vget_high_f32(_c1), 1);
+#endif
                 _f0 = vaddq_f32(_f0, _cc0);
                 _f1 = vaddq_f32(_f1, _cc1);
                 _f2 = vaddq_f32(_f2, _cc2);
@@ -8607,10 +8714,17 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
             }
             if (broadcast_type_C == 1 || broadcast_type_C == 2)
             {
+#if __aarch64__
                 float32x4_t _cc0 = vdupq_laneq_f32(_c0, 0);
                 float32x4_t _cc1 = vdupq_laneq_f32(_c0, 1);
                 float32x4_t _cc2 = vdupq_laneq_f32(_c0, 2);
                 float32x4_t _cc3 = vdupq_laneq_f32(_c0, 3);
+#else
+                float32x4_t _cc0 = vdupq_lane_f32(vget_low_f32(_c0), 0);
+                float32x4_t _cc1 = vdupq_lane_f32(vget_low_f32(_c0), 1);
+                float32x4_t _cc2 = vdupq_lane_f32(vget_high_f32(_c0), 0);
+                float32x4_t _cc3 = vdupq_lane_f32(vget_high_f32(_c0), 1);
+#endif
                 _f0 = vaddq_f32(_f0, _cc0);
                 _f1 = vaddq_f32(_f1, _cc1);
                 _f2 = vaddq_f32(_f2, _cc2);
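
Note on the pattern applied throughout this patch: vmulq_laneq_f32() and
vdupq_laneq_f32() take their lane operand from a 128-bit float32x4_t and are
AArch64-only intrinsics, which is what broke the armv7 build. ARMv7 NEON only
provides vmulq_lane_f32() and vdupq_lane_f32(), whose lane operand is a 64-bit
float32x2_t, so lanes 0-3 of a quad register have to be reached through
vget_low_f32()/vget_high_f32(). A minimal standalone sketch of the equivalence
(the helper name is illustrative, not part of the patch):

#include <arm_neon.h>

// Multiply every element of v by lane 2 of scale, on armv7 and aarch64 alike.
static inline float32x4_t mul_by_lane2(float32x4_t v, float32x4_t scale)
{
#if __aarch64__
    // AArch64-only: the lane index addresses the full q register directly.
    return vmulq_laneq_f32(v, scale, 2);
#else
    // armv7 fallback: lane 2 of a q register is lane 0 of its high d half.
    return vmulq_lane_f32(v, vget_high_f32(scale), 0);
#endif
}

Both branches should compile to a single multiply-by-element instruction;
vget_low_f32()/vget_high_f32() merely re-label the two halves of a q register,
so the armv7 path adds no extra arithmetic.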