diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
index a47ca1dce6..1dd3a2c713 100644
--- a/.github/workflows/c910v.yml
+++ b/.github/workflows/c910v.yml
@@ -37,7 +37,7 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
-          gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
+          gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross libglib2.0-dev
 
       - name: checkout qemu
         uses: actions/checkout@v3
@@ -52,6 +52,7 @@ jobs:
           wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
           cd qemu
           patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
+          export CXXFLAGS="-Wno-error"; export CFLAGS="-Wno-error"
           ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
           make -j$(nproc)
           make install
diff --git a/.github/workflows/codspeed-bench.yml b/.github/workflows/codspeed-bench.yml
index 25e196ef2a..94e0d708ed 100644
--- a/.github/workflows/codspeed-bench.yml
+++ b/.github/workflows/codspeed-bench.yml
@@ -15,7 +15,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest]
+        os: [ubuntu-22.04]
         fortran: [gfortran]
         build: [make]
         pyver: ["3.12"]
@@ -147,7 +147,7 @@ jobs:
           OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py -k 'gesdd'
 
       - name: Run benchmarks
-        uses: CodSpeedHQ/action@v2
+        uses: CodSpeedHQ/action@v3
         with:
           token: ${{ secrets.CODSPEED_TOKEN }}
           run: |
diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index 9e55e73467..f42d4c57fe 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -43,7 +43,9 @@ jobs:
         run: |
           if [ "$RUNNER_OS" == "Linux" ]; then
             sudo apt-get update
-            sudo apt-get install -y gfortran cmake ccache libtinfo5
+            sudo apt-get install -y gfortran cmake ccache
+            wget http://security.ubuntu.com/ubuntu/pool/universe/n/ncurses/libtinfo5_6.3-2ubuntu0.1_amd64.deb
+            sudo apt install ./libtinfo5_6.3-2ubuntu0.1_amd64.deb
           elif [ "$RUNNER_OS" == "macOS" ]; then
             # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
             brew reinstall gcc
diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml
index f1a75ad343..fdb48309b9 100644
--- a/.github/workflows/loongarch64_clang.yml
+++ b/.github/workflows/loongarch64_clang.yml
@@ -41,7 +41,7 @@ jobs:
       - name: Install APT deps
         run: |
           sudo apt-get update
-          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache
+          sudo apt-get install autoconf automake autotools-dev ninja-build make ccache libglib2.0-dev
 
       - name: Download and install loongarch64-toolchain
         run: |
diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml
index 1491aff78b..56da22c6b0 100644
--- a/.github/workflows/mips64.yml
+++ b/.github/workflows/mips64.yml
@@ -41,14 +41,14 @@ jobs:
         run: |
           sudo apt-get update
           sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
-          gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross
+          gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross libglib2.0-dev
 
       - name: checkout qemu
         uses: actions/checkout@v3
         with:
           repository: qemu/qemu
           path: qemu
-          ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2
+          ref: ae35f033b874c627d81d51070187fbf55f0bf1a7
 
       - name: build qemu
         run: |
@@ -59,8 +59,7 @@ jobs:
 
       - name: Compilation cache
         uses: actions/cache@v3
-        with:
-          path: ~/.ccache
+        with:          path: ~/.ccache
           key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
           restore-keys: |
             ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c
index 4742ae6a75..c50b0d5b42 100644
--- a/kernel/riscv64/gemm_tcopy_8_rvv.c
+++ b/kernel/riscv64/gemm_tcopy_8_rvv.c
@@ -28,35 +28,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n)               __riscv_vsetvl_e32m1(n)
-#define FLOAT_V_T               vfloat32m1_t
-#define FLOAT_VX2_T             vfloat32m1x2_t
-#define FLOAT_VX4_T             vfloat32m1x4_t
-#define FLOAT_VX8_T             vfloat32m1x8_t
-#define VLEV_FLOAT              __riscv_vle32_v_f32m1
-#define VLSEV_FLOAT             __riscv_vlse32_v_f32m1
-#define VSEV_FLOAT              __riscv_vse32_v_f32m1
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1x2
-#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2
-#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1x4
-#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
-#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1x8
-#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_V_T_HALF          vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VLEV_FLOAT_HALF         __riscv_vle32_v_f32m1
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_HALF         __riscv_vse32_v_f32m1
 #else
-#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
-#define FLOAT_V_T               vfloat64m1_t
-#define FLOAT_VX2_T             vfloat64m1x2_t
-#define FLOAT_VX4_T             vfloat64m1x4_t
-#define FLOAT_VX8_T             vfloat64m1x8_t
-#define VLEV_FLOAT              __riscv_vle64_v_f64m1
-#define VLSEV_FLOAT             __riscv_vlse64_v_f64m1
-#define VSEV_FLOAT              __riscv_vse64_v_f64m1
-#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1x2
-#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
-#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1x4
-#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
-#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1x8
-#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_HALF          vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m4
+#define VLEV_FLOAT_HALF         __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m4
+#define VSEV_FLOAT_HALF         __riscv_vse64_v_f64m2
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
@@ -69,9 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
     IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
 
     FLOAT_V_T v0;
-    FLOAT_VX2_T vx2;
-    FLOAT_VX4_T vx4;
-    FLOAT_VX8_T vx8;
+    FLOAT_V_T_HALF v1;
 
     // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
 
@@ -81,156 +63,12 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
     boffset3  = b + m  * (n & ~3);
     boffset4  = b + m  * (n & ~1);
 
-    for(j = (m >> 3); j > 0; j--) {
-
-        aoffset1  = aoffset;
-        aoffset += 8 * lda;
-
-        boffset1  = boffset;
-        boffset  += 64;
-
-        for(i = (n >> 3); i > 0; i--) {
-            size_t vl = 8;
-
-            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, vx8, vl);
-
-            aoffset1 += 8;
-            boffset1 += m * 8;
-        }
-
-        if (n & 4) {
-            size_t vl = 8;
-
-            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, vx4, vl);
-
-            aoffset1 += 4;
-            boffset2 += 32;
-        }
-
-        if (n & 2) {
-            size_t vl = 8;
-
-            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, vx2, vl);
-
-            aoffset1 += 2;
-            boffset3 += 16;
-        }
-
-        if (n & 1) {
-            size_t vl = 8;
-
-            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSEV_FLOAT(boffset4, v0, vl);
-
-            aoffset1 += 1;
-            boffset4 += 8;
-        }
-
-    }
-
-    if (m & 4) {
-
-        aoffset1  = aoffset;
-        aoffset += 4 * lda;
-
-        boffset1  = boffset;
-        boffset  += 32;
-
-        for(i = (n >> 3); i > 0; i--) {
-            size_t vl = 4;
-
-            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, vx8, vl);
-
-            aoffset1 += 8;
-            boffset1 += m * 8;
-        }
-
-        if (n & 4) {
-            size_t vl = 4;
-
-            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, vx4, vl);
-
-            aoffset1 += 4;
-            boffset2 += 16;
-        }
-
-        if (n & 2) {
-            size_t vl = 4;
-
-            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, vx2, vl);
-
-            aoffset1 += 2;
-            boffset3 += 8;
-	}
-
-        if (n & 1) {
-            size_t vl = 4;
-
-            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSEV_FLOAT(boffset4, v0, vl);
-
-            aoffset1 += 1;
-            boffset4 += 4;
-        }
-    }
-
-    if (m & 2) {
+    for(j = m; j > 0; j--) {
         aoffset1  = aoffset;
-        aoffset += 2 * lda;
-
         boffset1  = boffset;
-        boffset  += 16;
-
-        for(i = (n >> 3); i > 0; i--) {
-            size_t vl = 2;
 
-            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG8_FLOAT(boffset1, vx8, vl);
-
-            aoffset1 += 8;
-            boffset1 += m * 8;
-        }
-
-        if (n & 4) {
-            size_t vl = 2;
-
-            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG4_FLOAT(boffset2, vx4, vl);
-
-            aoffset1 += 4;
-            boffset2 += 8;
-        }
-
-        if (n & 2) {
-            size_t vl = 2;
-
-            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSSEG2_FLOAT(boffset3, vx2, vl);
-
-            aoffset1 += 2;
-            boffset3 += 4;
-	}
-  
-        if (n & 1) {
-           size_t vl = 2;
-
-            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
-            VSEV_FLOAT(boffset4, v0, vl);
-
-            aoffset1 += 1;
-            boffset4 += 2;
-        }
-    }
-
-    if (m & 1) {
-        aoffset1  = aoffset;
-        boffset1  = boffset;
+        aoffset += lda;
+        boffset  += 8;
 
         for(i = (n >> 3); i > 0; i--) {
             size_t vl = 8;
@@ -245,27 +83,25 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
         if (n & 4) {
             size_t vl = 4;
 
-            v0 = VLEV_FLOAT(aoffset1, vl);
-            VSEV_FLOAT(boffset2, v0, vl);
+            v1 = VLEV_FLOAT_HALF(aoffset1, vl);
+            VSEV_FLOAT_HALF(boffset2, v1, vl);
 
             aoffset1 += 4;
-            //boffset2 += 4;
+            boffset2 += 4;
         }
 
         if (n & 2) {
-            size_t vl = 2;
-
-            v0 = VLEV_FLOAT(aoffset1, vl);
-            VSEV_FLOAT(boffset3, v0, vl);
+            *(boffset3) = *(aoffset1);
+            *(boffset3 + 1) = *(aoffset1 + 1);
 
             aoffset1 += 2;
-           // boffset3 += 2;
+            boffset3 += 2;
         }
 
         if (n & 1) {
-           *(boffset4) = *(aoffset1);
-           // aoffset1 ++;
-           // boffset4 ++;
+            *(boffset4) = *(aoffset1);
+            aoffset1 ++;
+            boffset4 ++;
         }
     }