diff --git a/kernel/loongarch64/dsymv_L_lsx.S b/kernel/loongarch64/dsymv_L_lsx.S
index 1fd0d26f58..fed4081089 100644
--- a/kernel/loongarch64/dsymv_L_lsx.S
+++ b/kernel/loongarch64/dsymv_L_lsx.S
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 #include "common.h"
+#include "loongarch64_asm.S"
 /* Param */
 #define M $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2 $r28
 #define T3 $r29
 #define T4 $r30
+#define T5 $r17
+#define T6 $r16
+#define T7 $r12
 /* LSX vectors */
 #define U0 $vr31
@@ -87,10 +91,114 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8 $f8
 #define a9 $f9
+.macro LOAD_Y_8
+    beqz T5, .L01_Y_0
+    add.d T2, IY, INCY
+    fldx.d $f4, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f5, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f6, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f7, Y, T2
-    PROLOGUE
+    add.d T2, T2, INCY
+    fldx.d $f8, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f9, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f10, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f11, Y, T2
+
+    vextrins.d U4, U5, 0x10
+    vextrins.d U6, U7, 0x10
+    vextrins.d U8, U9, 0x10
+    vextrins.d U10, U11, 0x10
+    b .L01_Y_1
+.L01_Y_0:
+    add.d T7, IY, INCY
+    vldx U4, Y, T7
+    alsl.d T2, INCY, T7, 1
+    vldx U6, Y, T2
+    alsl.d T3, INCY, T2, 1
+    vldx U8, Y, T3
+    alsl.d T4, INCY, T3, 1
+    vldx U10, Y, T4
+.L01_Y_1:
+.endm
+
+.macro LOAD_X_8
+    beqz T6, .L01_X_0
+    add.d T2, IX, INCX
+    fldx.d $f4, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f5, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f6, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f7, X, T2
+
+    add.d T2, T2, INCX
+    fldx.d $f8, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f9, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f10, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f11, X, T2
+
+    vextrins.d U4, U5, 0x10
+    vextrins.d U6, U7, 0x10
+    vextrins.d U8, U9, 0x10
+    vextrins.d U10, U11, 0x10
+    b .L01_X_1
+.L01_X_0:
+    add.d T7, IX, INCX
+    vldx U4, X, T7
+    alsl.d T2, INCX, T7, 1
+    vldx U6, X, T2
+    alsl.d T3, INCX, T2, 1
+    vldx U8, X, T3
+    alsl.d T4, INCX, T3, 1
+    vldx U10, X, T4
+.L01_X_1:
+.endm
+
+.macro STORE_Y_8
+    beqz T5, .L01_Y_2
+    vextrins.d U5, U4, 0x01
+    vextrins.d U7, U6, 0x01
+    vextrins.d U9, U8, 0x01
+    vextrins.d U11, U10, 0x01
+
+    add.d T2, IY, INCY
+    fstx.d $f4, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f5, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f6, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f7, Y, T2
+
+    add.d T2, T2, INCY
+    fstx.d $f8, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f9, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f10, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f11, Y, T2
+    b .L01_Y_3
+.L01_Y_2:
+    vstx U4, Y, T7
+    vstx U6, Y, T2
+    vstx U8, Y, T3
+    vstx U10, Y, T4
+.L01_Y_3:
+.endm
-    LDARG BUFFER, $sp, 0
+    PROLOGUE
     addi.d $sp, $sp, -88
@@ -107,6 +215,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vldrepl.d VALPHA, $sp, 80
+    addi.d T5, INCY, -1
+    addi.d T6, INCX, -1
     slli.d LDA, LDA, BASE_SHIFT
     slli.d INCX, INCX, BASE_SHIFT
     slli.d INCY, INCY, BASE_SHIFT
@@ -122,11 +232,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     beq J, N, .L999
 .L01:
-    MTC a2, $r0 //temp2
+    vxor.v U2, U2, U2
     fldx.d a6, X, JX
     fmul.d a3, ALPHA, a6 //temp1
     vshuf4i.d U3, U3, 0x00
-    vshuf4i.d U2, U2, 0x00
     mul.d T0, J, LDA
     slli.d T1, J, BASE_SHIFT
@@ -163,105 +272,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vldx U16, AO1, T1
     addi.d T1, T1, 16
-    add.d T2, IY, INCY
-    fldx.d $f4, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f5, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f6, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f7, Y, T2
-
-    add.d T2, T2, INCY
-    fldx.d $f8, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f9, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f10, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f11, Y, T2
-
-    vextrins.d U4, U5, 0x10
-    vextrins.d U6, U7, 0x10
-    vextrins.d U8, U9, 0x10
-    vextrins.d U10, U11, 0x10
+    LOAD_Y_8
     vfmadd.d U4, U3, U1, U4
     vfmadd.d U6, U3, U14, U6
     vfmadd.d U8, U3, U15, U8
     vfmadd.d U10, U3, U16, U10
-    vextrins.d U5, U4, 0x01
-    vextrins.d U7, U6, 0x01
-    vextrins.d U9, U8, 0x01
-    vextrins.d U11, U10, 0x01
-
-    add.d T2, IY, INCY
-    fstx.d $f4, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f5, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f6, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f7, Y, T2
-
-    add.d T2, T2, INCY
-    fstx.d $f8, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f9, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f10, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f11, Y, T2
-
-    slli.d T2, INCY, 3
-    add.d IY, IY, T2
-
-    add.d T2, IX, INCX
-    fldx.d $f4, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f5, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f6, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f7, X, T2
-
-    add.d T2, T2, INCX
-    fldx.d $f8, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f9, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f10, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f11, X, T2
+    STORE_Y_8
-    vextrins.d U4, U5, 0x10
-    vextrins.d U6, U7, 0x10
-    vextrins.d U8, U9, 0x10
-    vextrins.d U10, U11, 0x10
+    alsl.d IY, INCY, IY, 3
-    vand.v $vr12, $vr2, $vr2
+    LOAD_X_8
     vfmadd.d U2, U1, U4, U2
-    vfsub.d U2, U2, $vr12
     vfmadd.d U2, U14, U6, U2
     vfmadd.d U2, U15, U8, U2
     vfmadd.d U2, U16, U10, U2
-    vextrins.d U4, U2, 0x01
-
-    fadd.d $f2, $f2, $f4
-    fadd.d $f2, $f2, $f12
-
-    vextrins.d U2, U2, 0x10
-
-    slli.d T2, INCX, 3
-    add.d IX, IX, T2
+    alsl.d IX, INCX, IX, 3
     addi.d II, II, 64
     addi.d I, I, 1
     blt I, T0, .L02
+    // Acc U2
+    GACC vf, d, U4, U2
+    vilvl.d U2, U4, U4
+
 .L03: /* &4 */
     sub.d T0, M, J
     addi.d T0, T0, -1
@@ -429,4 +467,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d $sp, $sp, 88
     jirl $r0, $r1, 0x0
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/dsymv_U_lsx.S b/kernel/loongarch64/dsymv_U_lsx.S
index f708196aaa..2589f31910 100644
--- a/kernel/loongarch64/dsymv_U_lsx.S
+++ b/kernel/loongarch64/dsymv_U_lsx.S
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 #include "common.h"
+#include "loongarch64_asm.S"
 /* Param */
 #define M $r4
@@ -57,6 +58,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2 $r28
 #define T3 $r29
 #define T4 $r30
+#define T5 $r17
+#define T6 $r16
+#define T7 $r12
 /* LSX vectors */
 #define U0 $vr31
@@ -87,10 +91,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8 $f8
 #define a9 $f9
+.macro LOAD_Y_8
+    beqz T5, .L01_Y_0
+    fldx.d $f4, Y, IY
+    add.d T2, IY, INCY
+    fldx.d $f5, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f6, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f7, Y, T2
-    PROLOGUE
+    add.d T2, T2, INCY
+    fldx.d $f8, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f9, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f10, Y, T2
+    add.d T2, T2, INCY
+    fldx.d $f11, Y, T2
-    LDARG BUFFER, $sp, 0
+    vextrins.d U4, U5, 0x10
+    vextrins.d U6, U7, 0x10
+    vextrins.d U8, U9, 0x10
+    vextrins.d U10, U11, 0x10
+    b .L01_Y_1
+.L01_Y_0:
+    vldx U4, Y, IY
+    alsl.d T2, INCY, IY, 1
+    vldx U6, Y, T2
+    alsl.d T3, INCY, T2, 1
+    vldx U8, Y, T3
+    alsl.d T4, INCY, T3, 1
+    vldx U10, Y, T4
+.L01_Y_1:
+.endm
+
+.macro STORE_Y_8
+    beqz T5, .L01_Y_2
+    vextrins.d U5, U4, 0x01
+    vextrins.d U7, U6, 0x01
+    vextrins.d U9, U8, 0x01
+    vextrins.d U11, U10, 0x01
+
+    fstx.d $f4, Y, IY
+    add.d T2, IY, INCY
+    fstx.d $f5, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f6, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f7, Y, T2
+
+    add.d T2, T2, INCY
+    fstx.d $f8, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f9, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f10, Y, T2
+    add.d T2, T2, INCY
+    fstx.d $f11, Y, T2
+    b .L01_Y_3
+.L01_Y_2:
+    vstx U4, Y, IY
+    vstx U6, Y, T2
+    vstx U8, Y, T3
+    vstx U10,Y, T4
+.L01_Y_3:
+.endm
+
+.macro LOAD_X_8
+    beqz T6, .L01_X_0
+    fldx.d $f4, X, IX
+    add.d T2, IX, INCX
+    fldx.d $f5, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f6, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f7, X, T2
+
+    add.d T2, T2, INCX
+    fldx.d $f8, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f9, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f10, X, T2
+    add.d T2, T2, INCX
+    fldx.d $f11, X, T2
+
+    vextrins.d U4, U5, 0x10
+    vextrins.d U6, U7, 0x10
+    vextrins.d U8, U9, 0x10
+    vextrins.d U10, U11, 0x10
+    b .L01_X_1
+.L01_X_0:
+    vldx U4, X, IX
+    alsl.d T2, INCX, IX, 1
+    vldx U6, X, T2
+    alsl.d T3, INCX, T2, 1
+    vldx U8, X, T3
+    alsl.d T4, INCX, T3, 1
+    vldx U10, X, T4
+.L01_X_1:
+.endm
+
+    PROLOGUE
     addi.d $sp, $sp, -88
@@ -107,6 +210,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vldrepl.d VALPHA, $sp, 80
+    addi.d T5, INCY, -1
+    addi.d T6, INCX, -1
     slli.d LDA, LDA, BASE_SHIFT
     slli.d INCX, INCX, BASE_SHIFT
     slli.d INCY, INCY, BASE_SHIFT
@@ -125,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     beq J, M, .L999
 .L01:
-    MTC $f2, $r0 //temp2
+    vxor.v U2, U2, U2
     fldx.d $f6, X, JX
     fmul.d $f3, ALPHA, $f6 //temp1
     vshuf4i.d U3, U3, 0x00
-    vshuf4i.d U2, U2, 0x00
     move IY, $r0
     move IX, $r0
@@ -152,102 +256,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vldx U16, AO1, T1
     addi.d T1, T1, 16
-    fldx.d $f4, Y, IY
-    add.d T2, IY, INCY
-    fldx.d $f5, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f6, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f7, Y, T2
-
-    add.d T2, T2, INCY
-    fldx.d $f8, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f9, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f10, Y, T2
-    add.d T2, T2, INCY
-    fldx.d $f11, Y, T2
-
-    vextrins.d U4, U5, 0x10
-    vextrins.d U6, U7, 0x10
-    vextrins.d U8, U9, 0x10
-    vextrins.d U10, U11, 0x10
+    LOAD_Y_8
     vfmadd.d U4, U3, U1, U4
     vfmadd.d U6, U3, U14, U6
     vfmadd.d U8, U3, U15, U8
     vfmadd.d U10, U3, U16, U10
-    vextrins.d U5, U4, 0x01
-    vextrins.d U7, U6, 0x01
-    vextrins.d U9, U8, 0x01
-    vextrins.d U11, U10, 0x01
+    STORE_Y_8
-    fstx.d $f4, Y, IY
-    add.d T2, IY, INCY
-    fstx.d $f5, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f6, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f7, Y, T2
+    alsl.d IY, INCY, IY, 3
-    add.d T2, T2, INCY
-    fstx.d $f8, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f9, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f10, Y, T2
-    add.d T2, T2, INCY
-    fstx.d $f11, Y, T2
-
-    slli.d T2, INCY, 3
-    add.d IY, IY, T2
-
-    fldx.d $f4, X, IX
-    add.d T2, IX, INCX
-    fldx.d $f5, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f6, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f7, X, T2
-
-    add.d T2, T2, INCX
-    fldx.d $f8, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f9, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f10, X, T2
-    add.d T2, T2, INCX
-    fldx.d $f11, X, T2
-
-    vextrins.d U4, U5, 0x10
-    vextrins.d U6, U7, 0x10
-    vextrins.d U8, U9, 0x10
-    vextrins.d U10, U11, 0x10
-
-    vand.v $vr12, $vr2, $vr2
+    LOAD_X_8
     vfmadd.d U2, U1, U4, U2
-    vfsub.d U2, U2, $vr12
     vfmadd.d U2, U14, U6, U2
     vfmadd.d U2, U15, U8, U2
     vfmadd.d U2, U16, U10, U2
-    vextrins.d U4, U2, 0x01
-
-    fadd.d $f2, $f2, $f4
-    fadd.d $f2, $f2, $f12
-
-    vextrins.d U2, U2, 0x10
-
-    slli.d T2, INCX, 3
-    add.d IX, IX, T2
+    alsl.d IX, INCX, IX, 3
     addi.d II, II, 64
     addi.d I, I, 1
     blt I, T0, .L02
+    // Acc U2
+    GACC vf, d, U4, U2
+    vilvl.d U2, U4, U4
+
 .L03: /* &4 */
     andi T0, J, 4
     beq $r0, T0, .L04
@@ -417,4 +453,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d $sp, $sp, 88
     jirl $r0, $r1, 0x0
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/ssymv_L_lsx.S b/kernel/loongarch64/ssymv_L_lsx.S
index 949e9e9025..a98cad38bf 100644
--- a/kernel/loongarch64/ssymv_L_lsx.S
+++ b/kernel/loongarch64/ssymv_L_lsx.S
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 #include "common.h"
+#include "loongarch64_asm.S"
 /* Param */
 #define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2 $r28
 #define T3 $r29
 #define T4 $r30
+#define T5 $r17
+#define T6 $r16
 /* LSX vectors */
 #define U0 $vr31
@@ -88,77 +91,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a9 $f9
-    PROLOGUE
-
-    LDARG BUFFER, $sp, 0
-
-    addi.d $sp, $sp, -88
-
-    SDARG $r23, $sp, 0
-    SDARG $r24, $sp, 8
-    SDARG $r25, $sp, 16
-    SDARG $r26, $sp, 32
-    SDARG $r27, $sp, 40
-    SDARG $r28, $sp, 48
-    SDARG $r29, $sp, 56
-    SDARG $r30, $sp, 64
-    SDARG $r31, $sp, 72
-    ST ALPHA, $sp, 80
-
-    vldrepl.w VALPHA, $sp, 80
-
-    slli.d LDA, LDA, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    slli.d INCY, INCY, BASE_SHIFT
-
-    bge $r0, M, .L999
-    bge $r0, N, .L999
-
-    move J, $r0
-    move JY, $r0
-    move JX, $r0
-    move AO1, A
-
-    beq J, N, .L999
-
-.L01:
-    MTC a2, $r0 //temp2
-    fldx.s a6, X, JX
-    fmul.s a3, ALPHA, a6 //temp1
-    vpermi.w U3, U3, 0x00
-    vpermi.w U2, U2, 0x00
-
-    mul.w T0, J, LDA
-    slli.d T1, J, BASE_SHIFT
-    add.w T0, T0, T1
-    fldx.s a6, AO1, T0
-    fldx.s a4, Y, JY
-    fmadd.s a4, a3, a6, a4
-    fstx.s a4, Y, JY
-
-    move IY, JY
-    move IX, JX
-    addi.d II, J, 1
-    move I, II
-    slli.d II, II, BASE_SHIFT
-
-    sub.d T0, M, J
-    addi.d T0, T0, -1
-    srai.d T0, T0, 3
-    add.d T0, T0, J
-    addi.d T0, T0, 1
-    beq I, T0, .L03
-    bge I, T0, .L03
-
-    mul.w T1, J, LDA
-    add.d T1, T1, II
-
-.L02: /* /8 */
-    vldx U1, AO1, T1
-    addi.d T1, T1, 16
-    vldx U14, AO1, T1
-    addi.d T1, T1, 16
-
+.macro LOAD_Y_8
+    beqz T5, .L01_Y_0
     add.d T2, IY, INCY
     fldx.s $f4, Y, T2
     add.d T2, T2, INCY
@@ -183,10 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.w U8, U9, 0x10
     vextrins.w U8, U10, 0x20
     vextrins.w U8, U11, 0x30
-
-    vfmadd.s U4, U3, U1, U4
-    vfmadd.s U8, U3, U14, U8
-
+    b .L01_Y_1
+.L01_Y_0:
+    add.d T3, IY, INCY
+    vldx U4, Y, T3
+    alsl.d T4, INCY, T3, 2
+    vldx U8, Y, T4
+.L01_Y_1:
+.endm
+
+.macro STORE_Y_8
+    beqz T5, .L01_Y_2
     vextrins.w U5, U4, 0x01
     vextrins.w U6, U4, 0x02
     vextrins.w U7, U4, 0x03
@@ -211,10 +152,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fstx.s $f10, Y, T2
     add.d T2, T2, INCY
     fstx.s $f11, Y, T2
-
-    slli.d T2, INCY, 3
-    add.d IY, IY, T2
-
+    b .L01_Y_3
+.L01_Y_2:
+    vstx U4, Y, T3
+    vstx U8, Y, T4
+.L01_Y_3:
+.endm
+
+.macro LOAD_X_8
+    beqz T6, .L01_X_0
     add.d T2, IX, INCX
     fldx.s $f4, X, T2
     add.d T2, T2, INCX
@@ -239,31 +185,109 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.w $vr8, $vr9, 0x10
     vextrins.w $vr8, $vr10, 0x20
     vextrins.w $vr8, $vr11, 0x30
+    b .L01_X_1
+.L01_X_0:
+    add.d T3, IX, INCX
+    vldx U4, X, T3
+    alsl.d T4, INCX, T3, 2
+    vldx U8, X, T4
+.L01_X_1:
+.endm
-    vand.v $vr12, $vr2, $vr2
+    PROLOGUE
-    vfmadd.s U2, U1, U4, U2
-    vfsub.s U2, U2, $vr12
-    vfmadd.s U2, U14, U8, U2
+    addi.d $sp, $sp, -88
-    vextrins.w U4, U2, 0x01
-    vextrins.w U5, U2, 0x02
-    vextrins.w U6, U2, 0x03
+    SDARG $r23, $sp, 0
+    SDARG $r24, $sp, 8
+    SDARG $r25, $sp, 16
+    SDARG $r26, $sp, 32
+    SDARG $r27, $sp, 40
+    SDARG $r28, $sp, 48
+    SDARG $r29, $sp, 56
+    SDARG $r30, $sp, 64
+    SDARG $r31, $sp, 72
+    ST ALPHA, $sp, 80
-    fadd.s $f2, $f2, $f4
-    fadd.s $f2, $f2, $f5
-    fadd.s $f2, $f2, $f6
-    fadd.s $f2, $f2, $f12
+    vldrepl.w VALPHA, $sp, 80
-    vpermi.w U2, U2, 0x00
+    addi.d T5, INCY, -1
+    addi.d T6, INCX, -1
+    slli.d LDA, LDA, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    slli.d INCY, INCY, BASE_SHIFT
+
+    bge $r0, M, .L999
+    bge $r0, N, .L999
+
+    move J, $r0
+    move JY, $r0
+    move JX, $r0
+    move AO1, A
+
+    beq J, N, .L999
+
+.L01:
+    vxor.v U2, U2, U2
+    fldx.s a6, X, JX
+    fmul.s a3, ALPHA, a6 //temp1
+    vpermi.w U3, U3, 0x00
+
+    mul.w T0, J, LDA
+    slli.d T1, J, BASE_SHIFT
+    add.w T0, T0, T1
+    fldx.s a6, AO1, T0
+    fldx.s a4, Y, JY
+    fmadd.s a4, a3, a6, a4
+    fstx.s a4, Y, JY
+
+    move IY, JY
+    move IX, JX
+    addi.d II, J, 1
+    move I, II
+    slli.d II, II, BASE_SHIFT
-    slli.d T2, INCX, 3
-    add.d IX, IX, T2
+    sub.d T0, M, J
+    addi.d T0, T0, -1
+    srai.d T0, T0, 3
+    add.d T0, T0, J
+    addi.d T0, T0, 1
+    beq I, T0, .L03
+    bge I, T0, .L03
+
+    mul.w T1, J, LDA
+    add.d T1, T1, II
+
+.L02: /* /8 */
+    vldx U1, AO1, T1
+    addi.d T1, T1, 16
+    vldx U14, AO1, T1
+    addi.d T1, T1, 16
+
+    LOAD_Y_8
+
+    vfmadd.s U4, U3, U1, U4
+    vfmadd.s U8, U3, U14, U8
+
+    STORE_Y_8
+
+    alsl.d IY, INCY, IY, 3
+
+    LOAD_X_8
+
+    vfmadd.s U2, U1, U4, U2
+    vfmadd.s U2, U14, U8, U2
+
+    alsl.d IX, INCX, IX, 3
     addi.d II, II, 32
     addi.d I, I, 1
     blt I, T0, .L02
+    // Acc U2
+    GACC vf, s, U4, U2
+    vpermi.w U2, U4, 0
+
 .L03: /* &4 */
     sub.d T0, M, J
     addi.d T0, T0, -1
@@ -426,4 +450,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d $sp, $sp, 88
     jirl $r0, $r1, 0x0
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE
diff --git a/kernel/loongarch64/ssymv_U_lsx.S b/kernel/loongarch64/ssymv_U_lsx.S
index f3898e1483..7ff9b9b7b3 100644
--- a/kernel/loongarch64/ssymv_U_lsx.S
+++ b/kernel/loongarch64/ssymv_U_lsx.S
@@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ASSEMBLER
 #include "common.h"
+#include "loongarch64_asm.S"
 /* Param */
 #define M $r4
@@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define T2 $r28
 #define T3 $r29
 #define T4 $r30
+#define T5 $r17
+#define T6 $r16
 /* LSX vectors */
 #define U0 $vr31
@@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define a8 $f8
 #define a9 $f9
-
-    PROLOGUE
-
-    LDARG BUFFER, $sp, 0
-
-    addi.d $sp, $sp, -88
-
-    SDARG $r23, $sp, 0
-    SDARG $r24, $sp, 8
-    SDARG $r25, $sp, 16
-    SDARG $r26, $sp, 32
-    SDARG $r27, $sp, 40
-    SDARG $r28, $sp, 48
-    SDARG $r29, $sp, 56
-    SDARG $r30, $sp, 64
-    SDARG $r31, $sp, 72
-    ST ALPHA, $sp, 80
-
-    vldrepl.w VALPHA, $sp, 80
-
-    slli.d LDA, LDA, BASE_SHIFT
-    slli.d INCX, INCX, BASE_SHIFT
-    slli.d INCY, INCY, BASE_SHIFT
-
-    bge $r0, M, .L999
-    bge $r0, N, .L999
-
-    sub.d M1, M, N
-
-    mul.d JY, M1, INCY
-    mul.d JX, M1, INCX
-
-    move J, M1
-    move AO1, A
-
-    beq J, M, .L999
-
-.L01:
-    MTC $f2, $r0 //temp2
-    fldx.s $f6, X, JX
-    fmul.s $f3, ALPHA, $f6 //temp1
-    vpermi.w U3, U3, 0x00
-    vpermi.w U2, U2, 0x00
-
-    move IY, $r0
-    move IX, $r0
-    move II, $r0
-    move I, $r0
-
-    srai.d T0, J, 3
-    beq I, T0, .L03
-
-    mul.w T1, J, LDA
-    add.d T1, T1, II
-
-.L02: /* /8 */
-    vldx U1, AO1, T1
-    addi.d T1, T1, 16
-    vldx U14, AO1, T1
-    addi.d T1, T1, 16
-
+.macro LOAD_Y_8
+    beqz T5, .L01_Y_0
     fldx.s $f4, Y, IY
     add.d T2, IY, INCY
     fldx.s $f5, Y, T2
@@ -171,10 +115,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.w U8, U9, 0x10
     vextrins.w U8, U10, 0x20
     vextrins.w U8, U11, 0x30
-
-    vfmadd.s U4, U3, U1, U4
-    vfmadd.s U8, U3, U14, U8
-
+    b .L01_Y_1
+.L01_Y_0:
+    vldx U4, Y, IY
+    alsl.d T2, INCY, IY, 2
+    vldx U8, Y, T2
+.L01_Y_1:
+.endm
+
+.macro STORE_Y_8
+    beqz T5, .L01_Y_2
     vextrins.w U5, U4, 0x01
     vextrins.w U6, U4, 0x02
     vextrins.w U7, U4, 0x03
@@ -198,10 +148,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     fstx.s $f10, Y, T2
     add.d T2, T2, INCY
     fstx.s $f11, Y, T2
-
-    slli.d T2, INCY, 3
-    add.d IY, IY, T2
-
+    b .L01_Y_3
+.L01_Y_2:
+    vstx U4, Y, IY
+    vstx U8, Y, T2
+.L01_Y_3:
+.endm
+
+.macro LOAD_X_8
+    beqz T6, .L01_X_0
     fldx.s $f4, X, IX
     add.d T2, IX, INCX
     fldx.s $f5, X, T2
@@ -225,31 +180,97 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vextrins.w $vr8, $vr9, 0x10
     vextrins.w $vr8, $vr10, 0x20
     vextrins.w $vr8, $vr11, 0x30
+    b .L01_X_1
+.L01_X_0:
+    vldx U4, X, IX
+    alsl.d T3, INCX, IX, 2
+    vldx U8, X, T3
+.L01_X_1:
+.endm
-    vand.v $vr12, $vr2, $vr2
+    PROLOGUE
-    vfmadd.s U2, U1, U4, U2
-    vfsub.s U2, U2, $vr12
-    vfmadd.s U2, U14, U8, U2
+    addi.d $sp, $sp, -88
-    vextrins.w U4, U2, 0x01
-    vextrins.w U5, U2, 0x02
-    vextrins.w U6, U2, 0x03
+    SDARG $r23, $sp, 0
+    SDARG $r24, $sp, 8
+    SDARG $r25, $sp, 16
+    SDARG $r26, $sp, 32
+    SDARG $r27, $sp, 40
+    SDARG $r28, $sp, 48
+    SDARG $r29, $sp, 56
+    SDARG $r30, $sp, 64
+    SDARG $r31, $sp, 72
+    ST ALPHA, $sp, 80
-    fadd.s $f2, $f2, $f4
-    fadd.s $f2, $f2, $f5
-    fadd.s $f2, $f2, $f6
-    fadd.s $f2, $f2, $f12
+    vldrepl.w VALPHA, $sp, 80
-    vpermi.w U2, U2, 0x00
+    addi.d T5, INCY, -1
+    addi.d T6, INCX, -1
+    slli.d LDA, LDA, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    slli.d INCY, INCY, BASE_SHIFT
+
+    bge $r0, M, .L999
+    bge $r0, N, .L999
+
+    sub.d M1, M, N
+
+    mul.d JY, M1, INCY
+    mul.d JX, M1, INCX
+
+    move J, M1
+    move AO1, A
+
+    beq J, M, .L999
+
+.L01:
+    vxor.v U2, U2, U2
+    fldx.s $f6, X, JX
+    fmul.s $f3, ALPHA, $f6 //temp1
+    vpermi.w U3, U3, 0x00
+
+    move IY, $r0
+    move IX, $r0
+    move II, $r0
+    move I, $r0
+
+    srai.d T0, J, 3
+    beq I, T0, .L03
+
+    mul.w T1, J, LDA
+    add.d T1, T1, II
-    slli.d T2, INCX, 3
-    add.d IX, IX, T2
+.L02: /* /8 */
+    vldx U1, AO1, T1
+    addi.d T1, T1, 16
+    vldx U14, AO1, T1
+    addi.d T1, T1, 16
+
+    LOAD_Y_8
+
+    vfmadd.s U4, U3, U1, U4
+    vfmadd.s U8, U3, U14, U8
+
+    STORE_Y_8
+
+    alsl.d IY, INCY, IY, 3
+
+    LOAD_X_8
+
+    vfmadd.s U2, U1, U4, U2
+    vfmadd.s U2, U14, U8, U2
+
+    alsl.d IX, INCX, IX, 3
     addi.d II, II, 32
     addi.d I, I, 1
     blt I, T0, .L02
+    // Acc U2
+    GACC vf, s, U4, U2
+    vpermi.w U2, U4, 0x00
+
 .L03: /* &4 */
     andi T0, J, 4
     beq $r0, T0, .L04
@@ -414,4 +435,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d $sp, $sp, 88
     jirl $r0, $r1, 0x0
-    EPILOGUE
\ No newline at end of file
+    EPILOGUE