From b2db064285bfaa36ac44b3a61cb4930cf0076d8d Mon Sep 17 00:00:00 2001 From: pengxu Date: Tue, 27 Feb 2024 10:47:49 +0800 Subject: [PATCH] Optimized sgemv and dgemv kernel LSX for LoongArch --- kernel/loongarch64/KERNEL.LOONGSON2K1000 | 12 + kernel/loongarch64/dgemv_n_lsx.S | 229 +++++++++++++++++++ kernel/loongarch64/dgemv_t_lsx.S | 279 +++++++++++++++++++++++ kernel/loongarch64/sgemv_n_lsx.S | 227 ++++++++++++++++++ kernel/loongarch64/sgemv_t_lsx.S | 275 ++++++++++++++++++++++ 5 files changed, 1022 insertions(+) create mode 100644 kernel/loongarch64/dgemv_n_lsx.S create mode 100644 kernel/loongarch64/dgemv_t_lsx.S create mode 100644 kernel/loongarch64/sgemv_n_lsx.S create mode 100644 kernel/loongarch64/sgemv_t_lsx.S diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 index c7ef440358..5b54a2adad 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON2K1000 +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -85,6 +85,12 @@ ZSWAPKERNEL = cswap_lsx.S CSUMKERNEL = csum_lsx.S ZSUMKERNEL = csum_lsx.S +SGEMVNKERNEL = sgemv_n_lsx.S +SGEMVTKERNEL = sgemv_t_lsx.S + +DGEMVNKERNEL = dgemv_n_lsx.S +DGEMVTKERNEL = dgemv_t_lsx.S + DGEMMKERNEL = dgemm_kernel_8x4.S DGEMMINCOPY = dgemm_ncopy_8_lsx.S DGEMMITCOPY = dgemm_tcopy_8_lsx.S @@ -100,6 +106,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +CGEMVNKERNEL = cgemv_n_4_lsx.S +CGEMVTKERNEL = cgemv_t_4_lsx.S + CGEMMKERNEL = cgemm_kernel_8x4_lsx.S CGEMMINCOPY = cgemm_ncopy_8_lsx.S CGEMMITCOPY = cgemm_tcopy_8_lsx.S @@ -115,6 +124,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ZGEMVNKERNEL = zgemv_n_2_lsx.S +ZGEMVTKERNEL = zgemv_t_2_lsx.S + ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S ZGEMMONCOPY = zgemm_ncopy_4_lsx.S ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S diff --git a/kernel/loongarch64/dgemv_n_lsx.S 
b/kernel/loongarch64/dgemv_n_lsx.S new file mode 100644 index 0000000000..9a0141fb1b --- /dev/null +++ b/kernel/loongarch64/dgemv_n_lsx.S @@ -0,0 +1,229 @@ +/******************************************************************************* +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param */
+#define M      $r4
+#define N      $r5
+#define A      $r7
+#define LDA    $r8
+#define X      $r9
+#define INCX   $r10
+#define Y      $r11
+#define INCY   $r6
+#define BUFFER $r16
+#define ALPHA  $f0
+
+#define YORIG  $r18
+#define T0     $r19
+#define T1     $r20
+#define XX     $r12
+#define YY     $r13
+#define I      $r14
+#define J      $r15
+#define AO1    $r23
+#define AO2    $r24
+#define IX     $r25
+#define IY     $r26
+#define II     $r27
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11
+#define U1     $vr12
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define VALPHA $vr10
+
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+/* dgemv_n (LSX): y += alpha * A * x, where column j of A starts at A + j * LDA. */
+/* .L01 walks the N columns; rows are updated 4 (.L02), 2 (.L03) and 1 (.L04) at a time. */
+    PROLOGUE
+
+    LDARG      INCY, $sp, 0            // INCY and BUFFER are read from the incoming stack area
+    LDARG      BUFFER, $sp, 8
+
+    addi.d     $sp, $sp, -80           // frame: saved $r23-$r30 plus a spill slot for ALPHA
+
+    SDARG      $r23, $sp, 0
+    SDARG      $r24, $sp, 8
+    SDARG      $r25, $sp, 16
+    SDARG      $r26, $sp, 32
+    SDARG      $r27, $sp, 40
+    SDARG      $r28, $sp, 48
+    SDARG      $r29, $sp, 56
+    SDARG      $r30, $sp, 64
+    ST         ALPHA, $sp, 72          // spill alpha so it can be broadcast from memory
+
+    vldrepl.d  VALPHA, $sp, 72         // VALPHA = {alpha, alpha}
+
+    slli.d     LDA, LDA, BASE_SHIFT    // element counts -> byte offsets (BASE_SHIFT is from common.h)
+    slli.d     INCX, INCX, BASE_SHIFT
+    slli.d     INCY, INCY, BASE_SHIFT
+
+    bge        $r0, M, .L999           // nothing to do when M <= 0
+    bge        $r0, N, .L999           // nothing to do when N <= 0
+
+    move       J, $r0                  // J  = column index
+    move       IX, $r0                 // IX = byte offset of x[J]
+
+    move       AO1, A                  // AO1 = start of the current column of A
+    move       XX, X
+    move       YY, Y
+
+    beq        J, M, .L999             // never taken here: J == 0 and M > 0 was checked above
+
+.L01:
+    add.d      T1, XX, IX              // T1 = &x[J] (T1 is free until .L02/.L03 reassign it)
+    vldrepl.d  U0, T1, 0               // U0 = {x[J], x[J]}; loads one element only, so no read past the end of x
+
+    vfmul.d    U1, VALPHA, U0          // U1 = alpha * x[J] in both lanes
+
+    move       IY, $r0                 // IY = byte offset of the current y element
+    move       II, $r0                 // II = byte offset of the current row inside the column
+    move       I, $r0
+
+    srai.d     T0, M, 2                // T0 = M / 4 iterations of the 4-row loop
+    beq        I, T0, .L03
+
+.L02:
+    vldx       U2, AO1, II             // U2 = rows i, i+1 of the column
+    addi.d     II, II, 16
+    vldx       U7, AO1, II             // U7 = rows i+2, i+3 of the column
+
+    move       T1, IY                  // T1..T4 = byte offsets of y[i..i+3] (y is strided by INCY)
+    add.d      T2, T1, INCY
+    add.d      T3, T2, INCY
+    add.d      T4, T3, INCY
+
+    fldx.d     a1, YY, T1              // a1..a4 are the low lanes of U3..U6
+    fldx.d     a2, YY, T2
+    fldx.d     a3, YY, T3
+    fldx.d     a4, YY, T4
+
+    vextrins.d U3, U4, 0x10            // U3 = {y[i],   y[i+1]}
+    vextrins.d U5, U6, 0x10            // U5 = {y[i+2], y[i+3]}
+
+    vfmadd.d   U3, U1, U2, U3          // y += (alpha * x[J]) * A
+    vfmadd.d   U5, U1, U7, U5
+
+    vextrins.d U4, U3, 0x01            // move the high lanes back to a2/a4 for the scalar stores
+    vextrins.d U6, U5, 0x01
+
+    fstx.d     a1, YY, T1
+    fstx.d     a2, YY, T2
+    fstx.d     a3, YY, T3
+    fstx.d     a4, YY, T4
+
+    add.d      IY, T4, INCY
+    addi.d     II, II, 16
+    addi.d     I, I, 1
+    blt        I, T0, .L02
+
+.L03:
+    andi       T0, M, 2                // two-row tail present?
+    beq        $r0, T0, .L04
+
+    addi.d     T1, $r0, 4
+    mod.d      T1, M, T1
+    sub.d      II, M, T1               // II = M - M % 4: first row not handled by .L02
+    slli.d     II, II, BASE_SHIFT
+
+    move       T1, IY
+    add.d      T2, T1, INCY
+
+    vldx       U2, AO1, II
+
+    fldx.d     a1, YY, T1
+    fldx.d     a2, YY, T2
+
+    vextrins.d U3, U4, 0x10            // U3 = {y[i], y[i+1]}
+
+    vfmadd.d   U3, U1, U2, U3
+
+    vextrins.d U4, U3, 0x01
+
+    fstx.d     a1, YY, T1
+    fstx.d     a2, YY, T2
+
+    add.d      IY, T2, INCY
+
+.L04:
+    andi       T0, M, 1                // single-row tail present?
+    beq        $r0, T0, .L05
+
+    addi.d     II, M, -1               // last row of the column
+    slli.d     II, II, BASE_SHIFT
+
+    fldx.d     a1, AO1, II
+    fldx.d     a3, YY, IY
+
+    fmadd.d    a3, $f12, a1, a3        // $f12 is the low lane of U1 ($vr12) = alpha * x[J]
+
+    fstx.d     a3, YY, IY
+
+    add.d      IY, IY, INCY
+
+.L05:
+    add.d      AO1, AO1, LDA           // next column of A
+    add.d      IX, IX, INCX            // next element of x
+
+    addi.d     J, J, 1
+    blt        J, N, .L01
+
+.L999:
+    LDARG      $r23, $sp, 0
+    LDARG      $r24, $sp, 8
+    LDARG      $r25, $sp, 16
+    LDARG      $r26, $sp, 32
+    LDARG      $r27, $sp, 40
+    LDARG      $r28, $sp, 48
+    LDARG      $r29, $sp, 56
+    LDARG      $r30, $sp, 64
+    LD         ALPHA, $sp, 72
+    addi.d     $sp, $sp, 80
+    jirl       $r0, $r1, 0x0           // return to caller
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_t_lsx.S b/kernel/loongarch64/dgemv_t_lsx.S
new file mode 100644
index 0000000000..76f0d9bdcc
--- /dev/null
+++ b/kernel/loongarch64/dgemv_t_lsx.S
@@ -0,0 +1,279 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +/* Param */ +#define M $r4 +#define N $r5 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INCX $r10 +#define Y $r11 +#define INCY $r6 +#define BUFFER $r16 +#define ALPHA $f0 + +#define YORIG $r18 +#define T0 $r19 +#define T1 $r20 +#define AO3 $r12 +#define AO4 $r13 +#define I $r14 +#define J $r15 +#define AO1 $r23 +#define AO2 $r24 +#define IX $r25 +#define IY $r26 +#define II $r27 +#define T2 $r28 +#define T3 $r29 +#define T4 $r30 + +/* LSX vectors */ +#define U0 $vr11 +#define U1 $vr12 +#define U2 $vr2 +#define U3 $vr3 +#define U4 $vr4 +#define U5 $vr5 +#define U6 $vr6 +#define U7 $vr7 +#define U8 $vr8 +#define U9 $vr9 +#define VALPHA $vr10 + +#define a1 $f3 +#define a2 $f4 +#define a3 $f5 +#define a4 $f6 +#define a5 $f7 +#define a6 $f8 +#define a7 $f9 +#define a8 $f10 + + + PROLOGUE + + LDARG INCY, $sp, 0 + LDARG BUFFER, $sp, 8 + + addi.d $sp, $sp, -80 + + SDARG $r23, $sp, 0 + SDARG $r24, 
$sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + ST ALPHA, $sp, 72 + + vldrepl.d VALPHA, $sp, 72 + + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT + + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move IY, $r0 + + move AO1, A //a_ptr1 + + srai.d T0, N, 2 //n/4 + beq J, T0, .L04 + +.L01: /* j