diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml index a47ca1dce6..1dd3a2c713 100644 --- a/.github/workflows/c910v.yml +++ b/.github/workflows/c910v.yml @@ -37,7 +37,7 @@ jobs: run: | sudo apt-get update sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ - gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross + gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev - name: checkout qemu uses: actions/checkout@v3 @@ -52,6 +52,7 @@ jobs: wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch cd qemu patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch + export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error" ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system make -j$(nproc) make install diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml index 25e196ef2a..94e0d708ed 100644 --- a/.github/workflows/codspeed-bench.yml +++ b/.github/workflows/codspeed-bench.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-latest] + os: [ubuntu-22.04] fortran: [gfortran] build: [make] pyver: ["3.12"] @@ -147,7 +147,7 @@ jobs: OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd' - name: Run benchmarks - uses: CodSpeedHQ/action@v2 + uses: CodSpeedHQ/action@v3 with: token: ${{ secrets.CODSPEED_TOKEN }} run: | diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 9e55e73467..f42d4c57fe 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -43,7 +43,9 @@ jobs: run: | if [ "$RUNNER_OS" == "Linux" ]; then sudo apt-get update - sudo apt-get install -y gfortran cmake ccache libtinfo5 + sudo apt-get install -y gfortran cmake ccache + wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb + sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb elif [ "$RUNNER_OS" == "macOS" ]; then # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed. brew reinstall gcc diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml index f1a75ad343..fdb48309b9 100644 --- a/.github/workflows/loongarch64_clang.yml +++ b/.github/workflows/loongarch64_clang.yml @@ -41,7 +41,7 @@ jobs: - name: Install APT deps run: | sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make ccache + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev - name: Download and install loongarch64-toolchain run: | diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml index 1491aff78b..56da22c6b0 100644 --- a/.github/workflows/mips64.yml +++ b/.github/workflows/mips64.yml @@ -41,14 +41,14 @@ jobs: run: | sudo apt-get update sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ - gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross + gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev - name: checkout qemu uses: actions/checkout@v3 with: repository: qemu/qemu path: qemu - ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 + ref: ae35f033b874c627d81d51070187fbf55f0bf1a7 - name: build qemu run: | @@ -59,8 +59,7 @@ jobs: - name: Compilation cache uses: actions/cache@v3 - with: - path: ~/.ccache + with: path: ~/.ccache key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} restore-keys: | ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c index 4742ae6a75..c50b0d5b42 100644 --- a/kernel/riscv64/gemm_tcopy_8_rvv.c +++ b/kernel/riscv64/gemm_tcopy_8_rvv.c @@ -28,35 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define FLOAT_VX2_T vfloat32m1x2_t -#define FLOAT_VX4_T vfloat32m1x4_t -#define FLOAT_VX8_T vfloat32m1x8_t -#define VLEV_FLOAT __riscv_vle32_v_f32m1 -#define VLSEV_FLOAT __riscv_vlse32_v_f32m1 -#define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_HALF vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1 #else -#define VSETVL(n) __riscv_vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define FLOAT_VX2_T vfloat64m1x2_t -#define FLOAT_VX4_T vfloat64m1x4_t -#define FLOAT_VX8_T vfloat64m1x8_t -#define VLEV_FLOAT __riscv_vle64_v_f64m1 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m1 -#define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_HALF vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2 #endif int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) @@ -69,9 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; FLOAT_V_T v0; - FLOAT_VX2_T vx2; - FLOAT_VX4_T vx4; - FLOAT_VX8_T vx8; + FLOAT_V_T_HALF v1; // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda); @@ -81,156 +63,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) boffset3 = b + m * (n & ~3); boffset4 = b + m * (n & ~1); - for(j = (m >> 3); j > 0; j--) { - - aoffset1 = aoffset; - aoffset += 8 * lda; - - boffset1 = boffset; - boffset += 64; - - for(i = (n >> 3); i > 0; i--) { - size_t vl = 8; - - vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, vx8, vl); - - aoffset1 += 8; - boffset1 += m * 8; - } - - if (n & 4) { - size_t vl = 8; - - vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, vx4, vl); - - aoffset1 += 4; - boffset2 += 32; - } - - if (n & 2) { - size_t vl = 8; - - vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, vx2, vl); - - aoffset1 += 2; - boffset3 += 16; - } - - if (n & 1) { - size_t vl = 8; - - v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSEV_FLOAT(boffset4, v0, vl); - - aoffset1 += 1; - boffset4 += 8; - } - - } - - if (m & 4) { - - aoffset1 = aoffset; - aoffset += 4 * lda; - - boffset1 = boffset; - boffset += 32; - - for(i = (n >> 3); i > 0; i--) { - size_t vl = 4; - - vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, vx8, vl); - - aoffset1 += 8; - boffset1 += m * 8; - } - - if (n & 4) { - size_t vl = 4; - - vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, vx4, vl); - - aoffset1 += 4; - boffset2 += 16; - } - - if (n & 2) { - size_t vl = 4; - - vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, vx2, vl); - - aoffset1 += 2; - boffset3 += 8; - } - - if (n & 1) { - size_t vl = 4; - - v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSEV_FLOAT(boffset4, v0, vl); - - aoffset1 += 1; - boffset4 += 4; - } - } - - if (m & 2) { + for(j = m; j > 0; j--) { aoffset1 = aoffset; - aoffset += 2 * lda; - boffset1 = boffset; - boffset += 16; - - for(i = (n >> 3); i > 0; i--) { - size_t vl = 2; - vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG8_FLOAT(boffset1, vx8, vl); - - aoffset1 += 8; - boffset1 += m * 8; - } - - if (n & 4) { - size_t vl = 2; - - vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG4_FLOAT(boffset2, vx4, vl); - - aoffset1 += 4; - boffset2 += 8; - } - - if (n & 2) { - size_t vl = 2; - - vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSSEG2_FLOAT(boffset3, vx2, vl); - - aoffset1 += 2; - boffset3 += 4; - } - - if (n & 1) { - size_t vl = 2; - - v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl); - VSEV_FLOAT(boffset4, v0, vl); - - aoffset1 += 1; - boffset4 += 2; - } - } - - if (m & 1) { - aoffset1 = aoffset; - boffset1 = boffset; + aoffset += lda; + boffset += 8; for(i = (n >> 3); i > 0; i--) { size_t vl = 8; @@ -245,27 +83,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) if (n & 4) { size_t vl = 4; - v0 = VLEV_FLOAT(aoffset1, vl); - VSEV_FLOAT(boffset2, v0, vl); + v1 = VLEV_FLOAT_HALF(aoffset1, vl); + VSEV_FLOAT_HALF(boffset2, v1, vl); aoffset1 += 4; - //boffset2 += 4; + boffset2 += 4; } if (n & 2) { - size_t vl = 2; - - v0 = VLEV_FLOAT(aoffset1, vl); - VSEV_FLOAT(boffset3, v0, vl); + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); aoffset1 += 2; - // boffset3 += 2; + boffset3 += 2; } if (n & 1) { - *(boffset4) = *(aoffset1); - // aoffset1 ++; - // boffset4 ++; + *(boffset4) = *(aoffset1); + aoffset1 ++; + boffset4 ++; } }