From c92104a6058775f64c677ff3cea3768e2e4f097a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladimir=20Nikoli=C4=87?= Date: Fri, 5 Jul 2024 12:32:58 -0700 Subject: [PATCH 001/244] Update cross compile info --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index b52e485647..578ab209a0 100644 --- a/README.md +++ b/README.md @@ -67,17 +67,17 @@ build options you plan to set. ### Cross compile Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler. -The target must be specified explicitly when cross compiling. +The target must be specified explicitly when cross compiling. The `CROSS=1` flag should be specified. Examples: * On an x86 box, compile this library for a loongson3a CPU: ```sh - make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A + make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 ``` or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI: ```sh - make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A + make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A CROSS=1 ``` * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: From ba47c7f4f301aad100ed166de338b86e01da8465 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 16 Jul 2024 15:57:24 -0500 Subject: [PATCH 002/244] Vectorize reduction stage of sgemv_t. 
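This patch replaces four scalar horizontal sums of the form y[i] += alpha * (temp_i[0] + temp_i[1] + temp_i[2] + temp_i[3]) with an in-register 4x4 transpose built from vec_mergeh/vec_mergel, after which one vector add chain and a single vector multiply-add update four elements of y at once. Below is a minimal plain-C model of that transpose-reduce step, assuming the usual merge-high/merge-low interleave semantics; mergeh and mergel are illustrative stand-ins, not the POWER intrinsics themselves.

```c
#include <stdio.h>

typedef struct { float v[4]; } v4f;

/* Stand-ins for vec_mergeh/vec_mergel on 4 x float: mergeh interleaves the
   first two lanes of each input, mergel the last two. */
static v4f mergeh(v4f a, v4f b) { return (v4f){{ a.v[0], b.v[0], a.v[1], b.v[1] }}; }
static v4f mergel(v4f a, v4f b) { return (v4f){{ a.v[2], b.v[2], a.v[3], b.v[3] }}; }
static v4f add(v4f a, v4f b) { v4f r; for (int i = 0; i < 4; i++) r.v[i] = a.v[i] + b.v[i]; return r; }

int main(void) {
  /* Lane j of temp[i] holds the j-th partial sum for output row i. */
  v4f temp[4] = { {{1,2,3,4}}, {{5,6,7,8}}, {{9,10,11,12}}, {{13,14,15,16}} };
  v4f t0 = mergeh(temp[0], temp[2]), t1 = mergel(temp[0], temp[2]);
  v4f t2 = mergeh(temp[1], temp[3]), t3 = mergel(temp[1], temp[3]);
  /* Two merge rounds transpose the 4x4 block, so adding the four merged
     vectors leaves the full horizontal sum of temp[i] in lane i. */
  v4f s = add(add(mergeh(t0, t2), mergel(t0, t2)),
              add(mergeh(t1, t3), mergel(t1, t3)));
  for (int i = 0; i < 4; i++)  /* expect 10 26 42 58 */
    printf("lane %d: %g (scalar %g)\n", i, s.v[i],
           temp[i].v[0] + temp[i].v[1] + temp[i].v[2] + temp[i].v[3]);
  return 0;
}
```

The kernel then scales the summed vector by alpha and accumulates it into v_y with one vector multiply-add instead of four scalar updates.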
--- kernel/power/sgemv_t.c | 54 ++++++++++++++++++++++++++++++---------- kernel/power/sgemv_t_8.c | 54 ++++++++++++++++++++++++++++++---------- 2 files changed, 82 insertions(+), 26 deletions(-) diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index c3fc8e77a1..e133c815c3 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -79,15 +79,32 @@ static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + register __vector float t0, t1, t2, t3; + register __vector float a = { alpha, alpha, alpha, alpha }; + __vector float *v_y = (__vector float*) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + t0 = vec_mergeh(temp4, temp6); + t1 = vec_mergel(temp4, temp6); + t2 = vec_mergeh(temp5, temp7); + t3 = vec_mergel(temp5, temp7); + temp4 = vec_mergeh(t0, t2); + temp5 = vec_mergel(t0, t2); + temp6 = vec_mergeh(t1, t3); + temp7 = vec_mergel(t1, t3); + temp4 += temp5 + temp6 + temp7; + + v_y[0] += a * temp0; + v_y[1] += a * temp4; } @@ -116,10 +133,21 @@ static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp3 += v_x[i] * va3[i]; } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + register __vector float t0, t1, t2, t3; + register __vector float a = { alpha, alpha, alpha, alpha }; + __vector float *v_y = (__vector float*) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + v_y[0] += a * temp0; } diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c index 1ee7c8aebb..f21f6eb7d2 100644 --- a/kernel/power/sgemv_t_8.c +++ b/kernel/power/sgemv_t_8.c @@ -100,15 +100,32 @@ static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); + register __vector float t0, t1, t2, t3; + register __vector float a = { alpha, alpha, alpha, alpha }; + __vector float *v_y = (__vector float*) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = 
vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + t0 = vec_mergeh(temp4, temp6); + t1 = vec_mergel(temp4, temp6); + t2 = vec_mergeh(temp5, temp7); + t3 = vec_mergel(temp5, temp7); + temp4 = vec_mergeh(t0, t2); + temp5 = vec_mergel(t0, t2); + temp6 = vec_mergeh(t1, t3); + temp7 = vec_mergel(t1, t3); + temp4 += temp5 + temp6 + temp7; + + v_y[0] += a * temp0; + v_y[1] += a * temp4; } @@ -137,10 +154,21 @@ static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); + register __vector float t0, t1, t2, t3; + register __vector float a = { alpha, alpha, alpha, alpha }; + __vector float *v_y = (__vector float*) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + v_y[0] += a * temp0; } From 66622de36d0b30161fcfbbf1ad22007f654efa4d Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 19 Jul 2024 07:26:08 -0500 Subject: [PATCH 003/244] Hack: Test gemv vs gemm. --- interface/gemm.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/interface/gemm.c b/interface/gemm.c index 4537b6a78f..e31e22d241 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -47,22 +47,29 @@ #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE #define ERROR_NAME "QGEMM " +#define GEMV BLASFUNC(qgemv) #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " +#define GEMV BLASFUNC(dgemv) #elif defined(BFLOAT16) #define ERROR_NAME "SBGEMM " +#define GEMV BLASFUNC(sbgemv) #else #define ERROR_NAME "SGEMM " +#define GEMV BLASFUNC(sgemv) #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifndef GEMM3M #ifdef XDOUBLE #define ERROR_NAME "XGEMM " +#define GEMV BLASFUNC(xgemv) #elif defined(DOUBLE) #define ERROR_NAME "ZGEMM " +#define GEMV BLASFUNC(zgemv) #else #define ERROR_NAME "CGEMM " +#define GEMV BLASFUNC(cgemv) #endif #else #ifdef XDOUBLE @@ -190,6 +197,16 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; +#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) +#if 1 + if (*N == 1) { + GEMV(TRANSA, K, M, alpha, a, ldA, b, N, beta, c, N); +//SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) + return; + } +#endif +#endif + #ifdef SMP double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) From e2334d02180c5bc24592ab4fe65aa109da19179b Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 1 Aug 2024 14:44:40 -0500 Subject: [PATCH 004/244] Remove GEMV hack. 
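The hack introduced in the previous patch forwards GEMM calls whose *N == 1 to GEMV, and this patch takes it out again. The identity it relied on is that a GEMM whose B operand has a single column is exactly one matrix-vector product. A plain-C sketch of that equivalence, assuming column-major storage and the non-transposed case only (naive reference loops, not the library's dispatch code):

```c
#include <math.h>
#include <stdio.h>

/* Naive column-major GEMM, no transposes: C := alpha*A*B + beta*C. */
static void gemm_nn(int m, int n, int k, float alpha, const float *a, int lda,
                    const float *b, int ldb, float beta, float *c, int ldc) {
  for (int j = 0; j < n; j++)
    for (int i = 0; i < m; i++) {
      float s = 0.0f;
      for (int l = 0; l < k; l++) s += a[i + l * lda] * b[l + j * ldb];
      c[i + j * ldc] = alpha * s + beta * c[i + j * ldc];
    }
}

/* Naive column-major GEMV, no transpose: y := alpha*A*x + beta*y. */
static void gemv_n(int m, int k, float alpha, const float *a, int lda,
                   const float *x, float beta, float *y) {
  for (int i = 0; i < m; i++) {
    float s = 0.0f;
    for (int l = 0; l < k; l++) s += a[i + l * lda] * x[l];
    y[i] = alpha * s + beta * y[i];
  }
}

int main(void) {
  enum { M = 3, K = 4 };
  float a[M * K], b[K], c1[M], c2[M];
  for (int i = 0; i < M * K; i++) a[i] = 0.25f * (float)(i + 1);
  for (int l = 0; l < K; l++)     b[l] = 0.5f * (float)(l + 1);
  for (int i = 0; i < M; i++)     c1[i] = c2[i] = 1.0f;
  gemm_nn(M, 1, K, 2.0f, a, M, b, K, 3.0f, c1, M); /* GEMM, single column of B */
  gemv_n(M, K, 2.0f, a, M, b, 3.0f, c2);           /* the equivalent GEMV */
  for (int i = 0; i < M; i++)
    printf("%g %g %s\n", c1[i], c2[i],
           fabsf(c1[i] - c2[i]) < 1e-6f ? "ok" : "MISMATCH");
  return 0;
}
```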
--- interface/gemm.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/interface/gemm.c b/interface/gemm.c index e31e22d241..4537b6a78f 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -47,29 +47,22 @@ #define SMP_THRESHOLD_MIN 65536.0 #ifdef XDOUBLE #define ERROR_NAME "QGEMM " -#define GEMV BLASFUNC(qgemv) #elif defined(DOUBLE) #define ERROR_NAME "DGEMM " -#define GEMV BLASFUNC(dgemv) #elif defined(BFLOAT16) #define ERROR_NAME "SBGEMM " -#define GEMV BLASFUNC(sbgemv) #else #define ERROR_NAME "SGEMM " -#define GEMV BLASFUNC(sgemv) #endif #else #define SMP_THRESHOLD_MIN 8192.0 #ifndef GEMM3M #ifdef XDOUBLE #define ERROR_NAME "XGEMM " -#define GEMV BLASFUNC(xgemv) #elif defined(DOUBLE) #define ERROR_NAME "ZGEMM " -#define GEMV BLASFUNC(zgemv) #else #define ERROR_NAME "CGEMM " -#define GEMV BLASFUNC(cgemv) #endif #else #ifdef XDOUBLE @@ -197,16 +190,6 @@ void NAME(char *TRANSA, char *TRANSB, IFLOAT *buffer; IFLOAT *sa, *sb; -#if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) -#if 1 - if (*N == 1) { - GEMV(TRANSA, K, M, alpha, a, ldA, b, N, beta, c, N); -//SUBROUTINE SGEMV(TRANS,M,N,ALPHA,A,LDA,X,INCX,BETA,Y,INCY) - return; - } -#endif -#endif - #ifdef SMP double MNK; #if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY) From 5b07ec643c4ed007df47f9775eec00e2d9410128 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Wed, 7 Aug 2024 09:43:47 +0200 Subject: [PATCH 005/244] require consistent minimal cmake version --- CMakeLists.txt | 6 +----- lapack-netlib/INSTALL/CMakeLists.txt | 2 +- lapack-netlib/LAPACKE/mangling/CMakeLists.txt | 2 +- lapack-netlib/lapack_build.cmake | 2 +- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 314e8d9d81..0e287eb9ba 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ ## Author: Hank Anderson ## -cmake_minimum_required(VERSION 2.8.5) +cmake_minimum_required(VERSION 3.6) project(OpenBLAS C ASM) @@ -258,10 +258,6 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago endif() endif() -if (APPLE AND BUILD_SHARED_LIBS) -set(CMAKE_MACOSX_RPATH ON) -endif() - # Seems that this hack doesn't required since macOS 11 Big Sur if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) diff --git a/lapack-netlib/INSTALL/CMakeLists.txt b/lapack-netlib/INSTALL/CMakeLists.txt index 1e808a64c3..b6c26753d3 100644 --- a/lapack-netlib/INSTALL/CMakeLists.txt +++ b/lapack-netlib/INSTALL/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8.7) +cmake_minimum_required(VERSION 3.6) project(TIMING Fortran) add_executable(secondtst_NONE second_NONE.f secondtst.f) add_executable(secondtst_EXT_ETIME second_EXT_ETIME.f secondtst.f) diff --git a/lapack-netlib/LAPACKE/mangling/CMakeLists.txt b/lapack-netlib/LAPACKE/mangling/CMakeLists.txt index 88ac0d85db..1b6b308e07 100644 --- a/lapack-netlib/LAPACKE/mangling/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/mangling/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 2.8.7) +cmake_minimum_required(VERSION 3.6) project(MANGLING C Fortran) add_executable(xintface Fintface.f Cintface.c) diff --git a/lapack-netlib/lapack_build.cmake b/lapack-netlib/lapack_build.cmake index 39878cb240..b15e18e441 100644 --- a/lapack-netlib/lapack_build.cmake +++ b/lapack-netlib/lapack_build.cmake @@ -4,7 +4,7 @@ ## HINTS: ctest -Ddashboard_model=Nightly -S $(pwd)/lapack/lapack_build.cmake ## -cmake_minimum_required(VERSION 2.8.10) 
+cmake_minimum_required(VERSION 3.6) ################################################################### # The values in this section must always be provided ################################################################### From 1ef9f24b39ece20e2fa3b3a2aab591da2037c346 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Wed, 7 Aug 2024 16:37:02 +0200 Subject: [PATCH 006/244] Revert "require consistent minimal cmake version" This reverts commit 5b07ec643c4ed007df47f9775eec00e2d9410128. --- CMakeLists.txt | 6 +++++- lapack-netlib/INSTALL/CMakeLists.txt | 2 +- lapack-netlib/LAPACKE/mangling/CMakeLists.txt | 2 +- lapack-netlib/lapack_build.cmake | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e287eb9ba..314e8d9d81 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ ## Author: Hank Anderson ## -cmake_minimum_required(VERSION 3.6) +cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) @@ -258,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago endif() endif() +if (APPLE AND BUILD_SHARED_LIBS) +set(CMAKE_MACOSX_RPATH ON) +endif() + # Seems that this hack doesn't required since macOS 11 Big Sur if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) diff --git a/lapack-netlib/INSTALL/CMakeLists.txt b/lapack-netlib/INSTALL/CMakeLists.txt index b6c26753d3..1e808a64c3 100644 --- a/lapack-netlib/INSTALL/CMakeLists.txt +++ b/lapack-netlib/INSTALL/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.6) +cmake_minimum_required(VERSION 2.8.7) project(TIMING Fortran) add_executable(secondtst_NONE second_NONE.f secondtst.f) add_executable(secondtst_EXT_ETIME second_EXT_ETIME.f secondtst.f) diff --git a/lapack-netlib/LAPACKE/mangling/CMakeLists.txt b/lapack-netlib/LAPACKE/mangling/CMakeLists.txt index 1b6b308e07..88ac0d85db 100644 --- a/lapack-netlib/LAPACKE/mangling/CMakeLists.txt +++ b/lapack-netlib/LAPACKE/mangling/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.6) +cmake_minimum_required(VERSION 2.8.7) project(MANGLING C Fortran) add_executable(xintface Fintface.f Cintface.c) diff --git a/lapack-netlib/lapack_build.cmake b/lapack-netlib/lapack_build.cmake index b15e18e441..39878cb240 100644 --- a/lapack-netlib/lapack_build.cmake +++ b/lapack-netlib/lapack_build.cmake @@ -4,7 +4,7 @@ ## HINTS: ctest -Ddashboard_model=Nightly -S $(pwd)/lapack/lapack_build.cmake ## -cmake_minimum_required(VERSION 3.6) +cmake_minimum_required(VERSION 2.8.10) ################################################################### # The values in this section must always be provided ################################################################### From f49371c1ba2ce00169892b194fc4e50dd9bfb6c2 Mon Sep 17 00:00:00 2001 From: Harmen Stoppels Date: Wed, 7 Aug 2024 16:40:11 +0200 Subject: [PATCH 007/244] Set CMake 3.0 policies to NEW --- CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 314e8d9d81..c6a80b13a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ ## Author: Hank Anderson ## -cmake_minimum_required(VERSION 2.8.5) +cmake_minimum_required(VERSION 2.8.5...3.0) project(OpenBLAS C ASM) @@ -258,10 +258,6 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago endif() endif() -if (APPLE AND BUILD_SHARED_LIBS) -set(CMAKE_MACOSX_RPATH ON) -endif() - # Seems that this hack doesn't required since 
macOS 11 Big Sur
 if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
 set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)

From fe0a69e3084df97849055346ca29eaee78d1c166 Mon Sep 17 00:00:00 2001
From: Harmen Stoppels
Date: Wed, 7 Aug 2024 16:37:02 +0200
Subject: [PATCH 008/244] even less invasive

---
 CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c6a80b13a8..a4e025503a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,9 @@
 ## Author: Hank Anderson
 ##

-cmake_minimum_required(VERSION 2.8.5...3.0)
+cmake_minimum_required(VERSION 2.8.5)
+
+cmake_policy(SET CMP0042 NEW)

 project(OpenBLAS C ASM)

From cbd321aecbb1069e5e02022cfc8f028e4487666f Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 8 Aug 2024 23:08:52 +0200
Subject: [PATCH 009/244] Update version to 0.3.28.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2006604146..df8d7eb5bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 28)
+set(OpenBLAS_PATCH_VERSION 28.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

From cd3945b99881423035cd9cdd00928e5d1671f30a Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 8 Aug 2024 23:09:45 +0200
Subject: [PATCH 010/244] Update version to 0.3.28.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index ac62d49cfb..e57388844a 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.28
+VERSION = 0.3.28.dev

 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
 # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library

From 1265eee85c304c2b7d33d5b48d6128de28acb1ca Mon Sep 17 00:00:00 2001
From: psykose
Date: Fri, 9 Aug 2024 20:38:05 +0200
Subject: [PATCH 011/244] fix cmake typo for power10 cc version check

fixes 668f48f4fc80db2d886576f20b7d4ddb6defd4c1
---
 cmake/system.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 683c3181db..a0b73ddae0 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -263,7 +263,7 @@ if (DEFINED TARGET)
   endif()

   if (${TARGET} STREQUAL POWER10)
-  if (CMAKE_C_COMPILER VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2)
+  if (CMAKE_C_COMPILER_VERSION VERSION_GREATER 10.2 OR CMAKE_C_COMPILER_VERSION VERSION_EQUAL 10.2)
   set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math")
   else ()
   message(FATAL_ERROR "Compiler GCC ${CMAKE_C_COMPILER_VERSION} does not support Power10.")

From 7ca835a82c5cb315997949804f134f32d9a14b70 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 10 Aug 2024 13:44:56 +0200
Subject: [PATCH 012/244] address clang array overflow warning

---
 kernel/x86_64/sbgemv_t_microk_cooperlake_template.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c
index 8a3a022fb3..69370e7441 100644
--- a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c
+++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c
@@ -2680,7 +2680,7 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b
 BLASLONG tag_n_32x = n & (~31);
 BLASLONG tag_n_128x = n & (~127);

- __m512 accum512_bridge[8];
+ __m512 accum512_bridge[16];
 __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3;
 __m256 accum256_0;
 __m128 accum128;

From 824306baabbf91555c07f81ec98b594584dc5952 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 12 Aug 2024 14:44:13 +0200
Subject: [PATCH 013/244] flesh out HERK prototype

---
 lapack/potrf/potrf_L_parallel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c
index 7d6bcd7764..6a2e4d4303 100644
--- a/lapack/potrf/potrf_L_parallel.c
+++ b/lapack/potrf/potrf_L_parallel.c
@@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0);
 #else
 syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO,
- &newarg, NULL, NULL, (int (*)(void))HERK_LN, sa, sb, args -> nthreads);
+ &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LN, sa, sb, args -> nthreads);
 #endif
 }
 }

From 73e13b027381833a003f42790ddcd4ff087e9798 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 12 Aug 2024 14:45:40 +0200
Subject: [PATCH 014/244] flesh out HERK prototype

---
 lapack/potrf/potrf_U_parallel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c
index 1f1427276b..de7d333742 100644
--- a/lapack/potrf/potrf_U_parallel.c
+++ b/lapack/potrf/potrf_U_parallel.c
@@ -121,7 +121,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
 HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0);
 #else
 syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
- &newarg, NULL, NULL, (int (*)(void))HERK_UC, sa, sb, args -> nthreads);
+ &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UC, sa, sb, args -> nthreads);
 #endif
 }
 }

From d8f740791a6f21e6d40c879bf2d8e127c4627d73 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Mon, 12 Aug 2024 14:50:49 +0200
Subject: [PATCH 015/244] tweak threshold a little more to cover POWER10 fma

---
 lapack-netlib/TESTING/stest_rfp.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lapack-netlib/TESTING/stest_rfp.in b/lapack-netlib/TESTING/stest_rfp.in
index 9b082b7df4..0e391aacc1 100644
--- a/lapack-netlib/TESTING/stest_rfp.in
+++ b/lapack-netlib/TESTING/stest_rfp.in
@@ -5,5 +5,5 @@ Data file for testing REAL LAPACK linear equation routines RFP format
 1 2 15 Values of NRHS (number of right hand sides)
 9 Number of matrix types (list types on next line if 0 < NTYPES < 9)
 1 2 3 4 5 6 7 8 9 Matrix Types
-42.0 Threshold value of test ratio
+45.0 Threshold value of test ratio
 T Put T to test the error exits

From b1737698db5773ffde6a3a6c8586da4bfb991099 Mon Sep 17 00:00:00 2001
From: Chip Kerchner
Date: Tue, 13 Aug 2024 07:01:21 -0500
Subject: [PATCH 016/244] Fix DEFAULTS in SBGEMM for POWER10. Also comparisons
 for the SBGEMM unit test cannot be exact due to epsilon differences.
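The comparison tolerance matters because bfloat16 keeps only the top 16 bits of an IEEE-754 float, that is 8 bits of mantissa, so every conversion can lose up to about 2^-8 in relative precision and the SGEMM and SBGEMM results legitimately differ in the low digits. A small sketch of the loss, assuming a truncating conversion for simplicity (the library's sbstobf16 conversion may round):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* bfloat16 is the high half of a float's bit pattern; zeroing the low 16
   mantissa bits models a truncating float -> bf16 -> float round trip. */
static float bf16_round_trip(float x) {
  uint32_t u;
  memcpy(&u, &x, sizeof u);
  u &= 0xffff0000u;
  memcpy(&x, &u, sizeof u);
  return x;
}

int main(void) {
  float a = 1.2345678f, b = 0.87654321f;
  float exact = a * b;
  float coarse = bf16_round_trip(a) * bf16_round_trip(b);
  printf("float product: %.7f\n", exact);
  printf("bf16  product: %.7f (abs diff %.2g)\n", coarse, (double)(exact - coarse));
  return 0;
}
```

With inputs drawn from [0.5, 1.5) as in this test, each element-wise error stays far below the 1.0 absolute tolerance that the test applies to both the SGEMM and the reference comparison.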
---
 param.h                     |  4 +--
 test/compare_sgemm_sbgemm.c | 68 +++++++++++++++++++++++--------------
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/param.h b/param.h
index 2618e1f609..0e4d8965d9 100644
--- a/param.h
+++ b/param.h
@@ -2637,8 +2637,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#undef SBGEMM_DEFAULT_Q #define SBGEMM_DEFAULT_UNROLL_M 16 #define SBGEMM_DEFAULT_UNROLL_N 8 -#define SBGEMM_DEFAULT_P 832 -#define SBGEMM_DEFAULT_Q 1026 +#define SBGEMM_DEFAULT_P 512 +#define SBGEMM_DEFAULT_Q 1024 #define SBGEMM_DEFAULT_R 4096 #endif diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index de589458bc..4b546fb1fd 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -81,6 +81,8 @@ float16to32 (bfloat16_bits f16) return f32.v; } +#define SBGEMM_LARGEST 256 + int main (int argc, char *argv[]) { @@ -88,12 +90,39 @@ main (int argc, char *argv[]) int i, j, l; blasint x, y; int ret = 0; - int loop = 100; + int loop = SBGEMM_LARGEST; char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; for (x = 0; x <= loop; x++) { + if ((x > 100) && (x != SBGEMM_LARGEST)) continue; + m = k = n = x; + float *A = (float *)malloc(m * k * sizeof(FLOAT)); + float *B = (float *)malloc(k * n * sizeof(FLOAT)); + float *C = (float *)malloc(m * n * sizeof(FLOAT)); + bfloat16_bits *AA = (bfloat16_bits *)malloc(m * k * sizeof(bfloat16_bits)); + bfloat16_bits *BB = (bfloat16_bits *)malloc(k * n * sizeof(bfloat16_bits)); + float *DD = (float *)malloc(m * n * sizeof(FLOAT)); + float *CC = (float *)malloc(m * n * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (DD == NULL) || (CC == NULL)) + return 1; + bfloat16 atmp,btmp; + blasint one=1; + + for (j = 0; j < m; j++) + { + for (i = 0; i < n; i++) + { + A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); + AA[j * k + i].v = atmp; + BB[j * k + i].v = btmp; + } + } for (y = 0; y < 4; y++) { if ((y == 0) || (y == 2)) { @@ -106,34 +135,16 @@ main (int argc, char *argv[]) } else { transB = 'T'; } - m = k = n = x; - float A[m * k]; - float B[k * n]; - float C[m * n]; - bfloat16_bits AA[m * k], BB[k * n]; - float DD[m * n], CC[m * n]; - bfloat16 atmp,btmp; - blasint one=1; - for (j = 0; j < m; j++) - { - for (i = 0; i < m; i++) - { - A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - C[j * k + i] = 0; - sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); - sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); - AA[j * k + i].v = atmp; - BB[j * k + i].v = btmp; - CC[j * k + i] = 0; - DD[j * k + i] = 0; - } - } + memset(CC, 0, m * n * sizeof(FLOAT)); + memset(DD, 0, m * n * sizeof(FLOAT)); + memset(C, 0, m * n * sizeof(FLOAT)); + SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA, &m, (bfloat16*)BB, &k, &beta, CC, &m); + for (i = 0; i < n; i++) for (j = 0; j < m; j++) if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) @@ -160,9 +171,16 @@ main (int argc, char *argv[]) } for (i = 0; i < n; i++) for (j = 0; j < m; j++) - if (CC[i * m + j] != DD[i * m + j]) + if (fabs (CC[i * m + j] - DD[i * m + j]) > 1.0) ret++; } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(DD); + free(CC); } if (ret != 0) From 20bdb658828e62a01dcc0b97edf14cb56f3ea6a8 Mon Sep 17 00:00:00 2001 From: Henry Chen Date: Mon, 12 Aug 2024 16:22:31 +0800 Subject: [PATCH 017/244] Fix recursive variable expansion in Makefiles for LOONGSON3A --- ctest/Makefile | 2 +- test/Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 
c02e04e1a3..877a190c19 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -26,7 +26,7 @@ endif override CFLAGS += -DADD$(BU) -DCBLAS ifeq ($(F_COMPILER),GFORTRAN) ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) - override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 + override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0 endif override FFLAGS += -fno-tree-vectorize endif diff --git a/test/Makefile b/test/Makefile index cfb2d41f54..65576d3dd1 100644 --- a/test/Makefile +++ b/test/Makefile @@ -2,7 +2,7 @@ TOPDIR = .. include ../Makefile.system ifeq ($(F_COMPILER),GFORTRAN) ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) - override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 + override FFLAGS := $(filter_out(-O2 -O3,$(FFLAGS))) -O0 endif override FFLAGS += -fno-tree-vectorize endif From 23b5d66a86417a071bba9a96a0573192237981b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Aug 2024 10:35:44 +0200 Subject: [PATCH 018/244] Ensure a memory buffer has been allocated for each thread before invoking it --- driver/others/blas_server.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 765511d8c7..b9a7674c17 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1076,6 +1076,8 @@ fprintf(STDERR, "Server[%2ld] Calculation started. Mode = 0x%03x M = %3ld N=%3l main_status[cpu] = MAIN_RUNNING1; #endif +if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2); + //For target LOONGSON3R5, applying an offset to the buffer is essential //for minimizing cache conflicts and optimizing performance. #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) From ef94b9653057e65328ca8fe8897830047b72cfe9 Mon Sep 17 00:00:00 2001 From: Henry Chen Date: Tue, 13 Aug 2024 14:53:37 +0800 Subject: [PATCH 019/244] Use ldc1 and sdc1 for the prologue and epilogue on LOONGSON3A This fix is similar to 2d8064174c444bb377cc2e3879a9c8e76e45b314. 
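The point of the change: the LD/ST macros are precision-dependent and can resolve to 32-bit loads and stores here, so pushing a callee-saved 64-bit floating-point register through them preserves only half of its bits, while ldc1/sdc1 always move the full doubleword. A host-side model of that width mismatch, with memcpy standing in for the store/load pair rather than the actual MIPS instructions:

```c
#include <stdio.h>
#include <string.h>

int main(void) {
  /* A callee-saved FP register holding a 64-bit double across a call. */
  double reg = 3.141592653589793;
  unsigned char save_area[8] = {0};
  double restored;

  /* Word-sized save (what a 32-bit store like swc1 would do): only half
     of the register image reaches the save area, the rest is lost. */
  memcpy(save_area, &reg, 4);
  memcpy(&restored, save_area, 8);
  printf("4-byte save/restore: %.17g\n", restored);

  /* Doubleword save (sdc1/ldc1): the value round-trips intact. */
  memcpy(save_area, &reg, 8);
  memcpy(&restored, save_area, 8);
  printf("8-byte save/restore: %.17g\n", restored);
  return 0;
}
```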
--- .../mips64/cgemm_kernel_loongson3a_4x2_ps.S | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S index 489b124455..f2d05faf81 100644 --- a/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S +++ b/kernel/mips64/cgemm_kernel_loongson3a_4x2_ps.S @@ -131,11 +131,11 @@ sd $21, 40($sp) sd $22, 48($sp) - ST $f24, 56($sp) - ST $f25, 64($sp) - ST $f26, 72($sp) - ST $f27, 80($sp) - ST $f28, 88($sp) + sdc1 $f24, 56($sp) + sdc1 $f25, 64($sp) + sdc1 $f26, 72($sp) + sdc1 $f27, 80($sp) + sdc1 $f28, 88($sp) #if defined(TRMMKERNEL) sd $23, 96($sp) @@ -146,10 +146,10 @@ #endif #ifndef __64BIT__ - ST $f20,120($sp) - ST $f21,128($sp) - ST $f22,136($sp) - ST $f23,144($sp) + sdc1 $f20,120($sp) + sdc1 $f21,128($sp) + sdc1 $f22,136($sp) + sdc1 $f23,144($sp) #endif .align 4 @@ -4000,11 +4000,11 @@ ld $21, 40($sp) ld $22, 48($sp) - LD $f24, 56($sp) - LD $f25, 64($sp) - LD $f26, 72($sp) - LD $f27, 80($sp) - LD $f28, 88($sp) + ldc1 $f24, 56($sp) + ldc1 $f25, 64($sp) + ldc1 $f26, 72($sp) + ldc1 $f27, 80($sp) + ldc1 $f28, 88($sp) #if defined(TRMMKERNEL) ld $23, 96($sp) @@ -4013,10 +4013,10 @@ #endif #ifndef __64BIT__ - LD $f20,120($sp) - LD $f21,128($sp) - LD $f22,136($sp) - LD $f23,144($sp) + ldc1 $f20,120($sp) + ldc1 $f21,128($sp) + ldc1 $f22,136($sp) + ldc1 $f23,144($sp) #endif daddiu $sp,$sp,STACKSIZE From 31226740d6f12c39e3f7ac3d3eb1475180121b5e Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 14 Aug 2024 08:10:25 -0500 Subject: [PATCH 020/244] Cleanup of SBGEMM unit test. --- test/compare_sgemm_sbgemm.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 4b546fb1fd..3953174410 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -113,13 +113,19 @@ main (int argc, char *argv[]) for (j = 0; j < m; j++) { - for (i = 0; i < n; i++) + for (i = 0; i < k; i++) { A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one); - sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); AA[j * k + i].v = atmp; + } + } + for (j = 0; j < n; j++) + { + for (i = 0; i < k; i++) + { + B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one); BB[j * k + i].v = btmp; } } @@ -147,10 +153,7 @@ main (int argc, char *argv[]) for (i = 0; i < n; i++) for (j = 0; j < m; j++) - if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) - ret++; - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) + { for (l = 0; l < k; l++) if (transA == 'N' && transB == 'N') { @@ -169,10 +172,11 @@ main (int argc, char *argv[]) DD[i * m + j] += float16to32 (AA[k * j + l]) * float16to32 (BB[i + l * n]); } - for (i = 0; i < n; i++) - for (j = 0; j < m; j++) + if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0) + ret++; if (fabs (CC[i * m + j] - DD[i * m + j]) > 1.0) ret++; + } } free(A); free(B); From f6469e21bcb443fd54927df7e510e6f1bf59acae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Aug 2024 16:00:43 +0200 Subject: [PATCH 021/244] move gelqs and geqrs to lapack-deprecated --- exports/gensymbol | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 28dd883f2b..d53b980515 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -880,10 +880,8 @@ lapackobjs2c="$lapackobjs2c # clatrs3 
lapackobjs2d="$lapackobjs2d - dgelqs dgelst dgeqp3rk - dgeqrs dlaqp2rk dlaqp3rk dlarmm @@ -897,10 +895,8 @@ lapackobjs2d="$lapackobjs2d # dlaqz4 lapackobjs2z="$lapackobjs2z - zgelqs zgelst zgeqp3rk - zgeqrs zlaqp2rk zlaqp3rk zlatrs3 @@ -918,6 +914,7 @@ lapack_extendedprecision_objs=" " lapack_deprecated_objsc=" + cgelqs cgeqrs cgegs cggsvd cgegv cggsvp cgelsx clahrd @@ -926,6 +923,7 @@ lapack_deprecated_objsc=" " lapack_deprecated_objsd=" + dgelqs dgeqrs dgegs dgeqpf dgegv dggsvd dgelsx dggsvp @@ -933,6 +931,8 @@ lapack_deprecated_objsd=" dlatzm dtzrqf" lapack_deprecated_objss=" + sgelqs + sgeqrs sgelsx sgegs sgegv @@ -945,6 +945,8 @@ lapack_deprecated_objss=" " lapack_deprecated_objsz=" + zgelqs + zgeqrs zgegs zgegv zgelsx From c23897f58501aed575dc25b1a54e968cbaf68da4 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 14 Aug 2024 15:55:23 -0500 Subject: [PATCH 022/244] Add GEMV testing to SBGEMx vs SGEMx testing. --- test/compare_sgemm_sbgemm.c | 76 ++++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 3953174410..cd508a0cf4 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "../common.h" #define SGEMM BLASFUNC(sgemm) #define SBGEMM BLASFUNC(sbgemm) +#define SGEMV BLASFUNC(sgemv) +#define SBGEMV BLASFUNC(sbgemv) typedef union { unsigned short v; @@ -187,7 +189,79 @@ main (int argc, char *argv[]) free(CC); } - if (ret != 0) + if (ret != 0) { fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); + return ret; + } + + k = 1; + for (x = 1; x <= loop; x++) + { + float *A = (float *)malloc(x * x * sizeof(FLOAT)); + float *B = (float *)malloc(x * sizeof(FLOAT)); + float *C = (float *)malloc(x * sizeof(FLOAT)); + bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits)); + bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits)); + float *DD = (float *)malloc(x * sizeof(FLOAT)); + float *CC = (float *)malloc(x * sizeof(FLOAT)); + if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || + (DD == NULL) || (CC == NULL)) + return 1; + bfloat16 atmp, btmp; + blasint one = 1; + + for (j = 0; j < x; j++) + { + for (i = 0; i < x; i++) + { + A[j * x + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one); + AA[j * x + i].v = atmp; + } + B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j], &one, &btmp, &one); + BB[j].v = btmp; + } + for (y = 0; y < 2; y++) + { + if (y == 0) { + transA = 'N'; + } else { + transA = 'T'; + } + + memset(CC, 0, x * sizeof(FLOAT)); + memset(DD, 0, x * sizeof(FLOAT)); + memset(C, 0, x * sizeof(FLOAT)); + + SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); + SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); + + for (j = 0; j < x; j++) + for (i = 0; i < x; i++) + if (transA == 'N') { + DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]); + } else if (transA == 'T') { + DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]); + } + + for (j = 0; j < x; j++) { + if (fabs (CC[j] - C[j]) > 1.0) + ret++; + if (fabs (CC[j] - DD[j]) > 1.0) + ret++; + } + } + free(A); + free(B); + free(C); + free(AA); + free(BB); + free(DD); + free(CC); + } + + if (ret != 0) + fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); return ret; } From 
2d84ed7e76cb6533d28d3c54f2443727ae4e9fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vladimir=20Nikoli=C4=87?= Date: Wed, 14 Aug 2024 14:31:35 -0700 Subject: [PATCH 023/244] Update README.md --- README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 578ab209a0..e250be6267 100644 --- a/README.md +++ b/README.md @@ -67,17 +67,17 @@ build options you plan to set. ### Cross compile Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler. -The target must be specified explicitly when cross compiling. The `CROSS=1` flag should be specified. +The target must be specified explicitly when cross compiling. Examples: * On an x86 box, compile this library for a loongson3a CPU: ```sh - make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 + make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A ``` or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI: ```sh - make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A CROSS=1 + make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A ``` * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: @@ -85,6 +85,8 @@ Examples: make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 ``` +When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build. + ### Debug version A debug version can be built using `make DEBUG=1`. 
From 1b8e40874e14e8117bff91327973d2453ba61203 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2024 09:33:42 +0200 Subject: [PATCH 024/244] Add autodetection support for Intel Granite Rapids as Sapphire Rapids --- cpuid_x86.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index f77cca1d87..9b2b7a51eb 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1527,6 +1527,19 @@ int get_cpuname(void){ break; case 10: //family 6 exmodel 10 switch (model) { + case 13: // Granite Rapids + if(support_amx_bf16()) + return CPUTYPE_SAPPHIRERAPIDS; + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; + if(support_avx512()) + return CPUTYPE_SKYLAKEX; + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; case 5: // Comet Lake H and S case 6: // Comet Lake U case 10: // Meteor Lake @@ -2352,8 +2365,22 @@ int get_coretype(void){ case 10: switch (model) { + case 13: // Granite Rapids + if(support_amx_bf16()) + return CORE_SAPPHIRERAPIDS; + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + if(support_avx512()) + return CORE_SKYLAKEX; + if(support_avx2()) + return CORE_HASWELL; + if(support_avx()) + return CORE_SANDYBRIDGE; + else + return CORE_NEHALEM; case 5: // Comet Lake H and S case 6: // Comet Lake U + case 10: // Meteor Lake if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; @@ -2362,6 +2389,7 @@ int get_coretype(void){ #endif else return CORE_NEHALEM; + case 0: // Meteor Lake case 7:// Rocket Lake #ifndef NO_AVX512 if(support_avx512()) From fd033467acbd56710810a4405f0d8926cc800027 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 15 Aug 2024 16:48:48 +0800 Subject: [PATCH 025/244] Fixed the undefined reference to blas_set_parameter Fixed the undefined reference to blas_set_parameter when enabling USE_OPENMP and DYNAMIC_ARCH. --- driver/others/blas_server_omp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 06862cec02..4341389d81 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -114,9 +114,11 @@ void goto_set_num_threads(int num_threads) { adjust_thread_buffers(); #if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64) +#ifndef DYNAMIC_ARCH //set parameters for different number of threads. 
blas_set_parameter(); #endif +#endif } void openblas_set_num_threads(int num_threads) { From ed0321563a7858ad989ade66eac7b2176154af9d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2024 11:11:07 +0200 Subject: [PATCH 026/244] fix installation of NDK in armv7 crossbuild --- .cirrus.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index d0e1eeff71..1408f25666 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -89,11 +89,11 @@ task: type: text/plain macos_instance: - image: ghcr.io/cirruslabs/macos-monterey-xcode:latest + image: ghcr.io/cirruslabs/macos-sonoma-xcode:latest task: name: AppleM1/LLVM armv7-androidndk xbuild compile_script: - - brew install android-ndk + - brew install --cask android-ndk - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" From 94c9e0b7ad60c3149e05a24ebb7d7fe3175e5200 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2024 11:30:23 +0200 Subject: [PATCH 027/244] Update ndk version number --- .cirrus.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 1408f25666..17e4eb7e87 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -96,13 +96,14 @@ task: - brew install --cask android-ndk - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - ls /System/Volumes/Data/opt/homebrew + - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" + - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" + - ls /opt/homebrew - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk - - find /System/Volumes/Data/opt/homebrew -name "armv7a-linux-androideabi*-ranlib" + - find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib" - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/26d/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang + - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: From d24b3cf39392a99e81ed47a5f093fbd074d4b39b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2024 15:32:58 +0200 Subject: [PATCH 028/244] properly fix buffer allocation and assignment --- driver/others/blas_server.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index b9a7674c17..29f8a5e646 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1076,7 +1076,11 @@ fprintf(STDERR, "Server[%2ld] Calculation started. 
Mode = 0x%03x M = %3ld N=%3l main_status[cpu] = MAIN_RUNNING1; #endif -if (buffer == NULL) blas_thread_buffer[cpu] = blas_memory_alloc(2); +if (buffer == NULL) { + blas_thread_buffer[cpu] = blas_memory_alloc(2); + buffer = blas_thread_buffer[cpu]; +} + //For target LOONGSON3R5, applying an offset to the buffer is essential //for minimizing cache conflicts and optimizing performance. From e05d98d00a8a4cf3566013f12b6420344d579d2e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2024 22:14:29 +0200 Subject: [PATCH 029/244] expressly use fld.d/fst.d for floating point registers instead of LD/ST macros --- kernel/loongarch64/cgemm_kernel_16x4_lasx.S | 42 ++++++++++----------- kernel/loongarch64/dgemm_kernel_16x4.S | 38 +++++++++---------- kernel/loongarch64/zgemm_kernel_8x4_lasx.S | 42 ++++++++++----------- 3 files changed, 61 insertions(+), 61 deletions(-) diff --git a/kernel/loongarch64/cgemm_kernel_16x4_lasx.S b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S index 249abe1022..4042ff745e 100644 --- a/kernel/loongarch64/cgemm_kernel_16x4_lasx.S +++ b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S @@ -196,17 +196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SDARG $r25, $sp, 16 SDARG $r26, $sp, 24 SDARG $r27, $sp, 32 - ST $f23, $sp, 40 - ST $f24, $sp, 48 - ST $f25, $sp, 56 - ST $f26, $sp, 64 - ST $f27, $sp, 72 - ST $f28, $sp, 80 - ST $f29, $sp, 88 - ST $f30, $sp, 96 - ST $f31, $sp, 104 - ST ALPHA_R,$sp, 112 - ST ALPHA_I,$sp, 120 + fst.d $f23, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + fst.d $f30, $sp, 96 + fst.d $f31, $sp, 104 + fst.d ALPHA_R,$sp, 112 + fst.d ALPHA_I,$sp, 120 xvldrepl.w VALPHAR, $sp, 112 xvldrepl.w VALPHAI, $sp, 120 @@ -3741,17 +3741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDARG $r25, $sp, 16 LDARG $r26, $sp, 24 LDARG $r27, $sp, 32 - LD $f23, $sp, 40 - LD $f24, $sp, 48 - LD $f25, $sp, 56 - LD $f26, $sp, 64 - LD $f27, $sp, 72 - LD $f28, $sp, 80 - LD $f29, $sp, 88 - LD $f30, $sp, 96 - LD $f31, $sp, 104 + fld.d $f23, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + fld.d $f30, $sp, 96 + fld.d $f31, $sp, 104 addi.d $sp, $sp, 128 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S index f8e26fda22..af98d68c9c 100644 --- a/kernel/loongarch64/dgemm_kernel_16x4.S +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -1098,16 +1098,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SDARG $r25, $sp, 16 SDARG $r26, $sp, 24 SDARG $r27, $sp, 32 - ST $f23, $sp, 40 - ST $f24, $sp, 48 - ST $f25, $sp, 56 - ST $f26, $sp, 64 - ST $f27, $sp, 72 - ST $f28, $sp, 80 - ST $f29, $sp, 88 - ST $f30, $sp, 96 - ST $f31, $sp, 104 - ST ALPHA, $sp, 112 + fst.d $f23, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + fst.d $f30, $sp, 96 + fst.d $f31, $sp, 104 + fst.d ALPHA, $sp, 112 #if defined (TRMMKERNEL) && !defined(LEFT) sub.d OFF, ZERO, OFFSET @@ -3504,15 +3504,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
LDARG $r25, $sp, 16 LDARG $r26, $sp, 24 LDARG $r27, $sp, 32 - LD $f23, $sp, 40 - LD $f24, $sp, 48 - LD $f25, $sp, 56 - LD $f26, $sp, 64 - LD $f27, $sp, 72 - LD $f28, $sp, 80 - LD $f29, $sp, 88 - LD $f30, $sp, 96 - LD $f31, $sp, 104 + fld.d $f23, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + fld.d $f30, $sp, 96 + fld.d $f31, $sp, 104 addi.d $sp, $sp, 120 jirl $r0, $r1, 0x0 diff --git a/kernel/loongarch64/zgemm_kernel_8x4_lasx.S b/kernel/loongarch64/zgemm_kernel_8x4_lasx.S index ca90b30f55..43533affb3 100644 --- a/kernel/loongarch64/zgemm_kernel_8x4_lasx.S +++ b/kernel/loongarch64/zgemm_kernel_8x4_lasx.S @@ -196,17 +196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SDARG $r25, $sp, 16 SDARG $r26, $sp, 24 SDARG $r27, $sp, 32 - ST $f23, $sp, 40 - ST $f24, $sp, 48 - ST $f25, $sp, 56 - ST $f26, $sp, 64 - ST $f27, $sp, 72 - ST $f28, $sp, 80 - ST $f29, $sp, 88 - ST $f30, $sp, 96 - ST $f31, $sp, 104 - ST ALPHA_R,$sp, 112 - ST ALPHA_I,$sp, 120 + fst.d $f23, $sp, 40 + fst.d $f24, $sp, 48 + fst.d $f25, $sp, 56 + fst.d $f26, $sp, 64 + fst.d $f27, $sp, 72 + fst.d $f28, $sp, 80 + fst.d $f29, $sp, 88 + fst.d $f30, $sp, 96 + fst.d $f31, $sp, 104 + fst.d ALPHA_R,$sp, 112 + fst.d ALPHA_I,$sp, 120 xvldrepl.d VALPHAR, $sp, 112 xvldrepl.d VALPHAI, $sp, 120 @@ -3529,17 +3529,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDARG $r25, $sp, 16 LDARG $r26, $sp, 24 LDARG $r27, $sp, 32 - LD $f23, $sp, 40 - LD $f24, $sp, 48 - LD $f25, $sp, 56 - LD $f26, $sp, 64 - LD $f27, $sp, 72 - LD $f28, $sp, 80 - LD $f29, $sp, 88 - LD $f30, $sp, 96 - LD $f31, $sp, 104 + fld.d $f23, $sp, 40 + fld.d $f24, $sp, 48 + fld.d $f25, $sp, 56 + fld.d $f26, $sp, 64 + fld.d $f27, $sp, 72 + fld.d $f28, $sp, 80 + fld.d $f29, $sp, 88 + fld.d $f30, $sp, 96 + fld.d $f31, $sp, 104 addi.d $sp, $sp, 128 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE From 49080b631e25227047be2638523266226c076662 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Aug 2024 22:15:27 +0200 Subject: [PATCH 030/244] remove optimizer pragma again --- utest/test_potrs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 642ce1e376..f39287d6f3 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "openblas_utest.h" -#pragma GCC optimize("no-gcse") + /* void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, From dfba3f8841583debe277abac1ccc7790ac36a742 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Aug 2024 11:23:19 +0200 Subject: [PATCH 031/244] restore the pragma as it is reportedly still needed on 3C6000/gcc14.2 --- utest/test_potrs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/test_potrs.c b/utest/test_potrs.c index f39287d6f3..642ce1e376 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include "openblas_utest.h" - +#pragma GCC optimize("no-gcse") /* void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, From b1802f4dc8c1634dc3fc2c6ef7684b1b7bd25f1c Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 16 Aug 2024 09:51:37 -0500 Subject: [PATCH 032/244] Fix unit test to start at 1 instead of 0 - since malloc zero bytes fails on some systems. --- test/compare_sgemm_sbgemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index cd508a0cf4..9209a61038 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -96,7 +96,7 @@ main (int argc, char *argv[]) char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; - for (x = 0; x <= loop; x++) + for (x = 1; x <= loop; x++) { if ((x > 100) && (x != SBGEMM_LARGEST)) continue; m = k = n = x; From 868aa857bc3b32576e1ef1abd8501504339099c6 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 16 Aug 2024 10:28:10 -0500 Subject: [PATCH 033/244] Change malloc zero to return one byte and update the SBGEMM test to again use sizes of zero. --- test/compare_sgemm_sbgemm.c | 40 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 9209a61038..7ea71b63d3 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -85,6 +85,14 @@ float16to32 (bfloat16_bits f16) #define SBGEMM_LARGEST 256 +void *malloc_safe(size_t size) +{ + if (size == 0) + return malloc(1); + else + return malloc(size); +} + int main (int argc, char *argv[]) { @@ -96,17 +104,17 @@ main (int argc, char *argv[]) char transA = 'N', transB = 'N'; float alpha = 1.0, beta = 0.0; - for (x = 1; x <= loop; x++) + for (x = 0; x <= loop; x++) { if ((x > 100) && (x != SBGEMM_LARGEST)) continue; m = k = n = x; - float *A = (float *)malloc(m * k * sizeof(FLOAT)); - float *B = (float *)malloc(k * n * sizeof(FLOAT)); - float *C = (float *)malloc(m * n * sizeof(FLOAT)); - bfloat16_bits *AA = (bfloat16_bits *)malloc(m * k * sizeof(bfloat16_bits)); - bfloat16_bits *BB = (bfloat16_bits *)malloc(k * n * sizeof(bfloat16_bits)); - float *DD = (float *)malloc(m * n * sizeof(FLOAT)); - float *CC = (float *)malloc(m * n * sizeof(FLOAT)); + float *A = (float *)malloc_safe(m * k * sizeof(FLOAT)); + float *B = (float *)malloc_safe(k * n * sizeof(FLOAT)); + float *C = (float *)malloc_safe(m * n * sizeof(FLOAT)); + bfloat16_bits *AA = (bfloat16_bits *)malloc_safe(m * k * sizeof(bfloat16_bits)); + bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(k * n * sizeof(bfloat16_bits)); + float *DD = (float *)malloc_safe(m * n * sizeof(FLOAT)); + float *CC = (float *)malloc_safe(m * n * sizeof(FLOAT)); if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || (DD == NULL) || (CC == NULL)) return 1; @@ -195,15 +203,15 @@ main (int argc, char *argv[]) } k = 1; - for (x = 1; x <= loop; x++) + for (x = 0; x <= loop; x++) { - float *A = (float *)malloc(x * x * sizeof(FLOAT)); - float *B = (float *)malloc(x * sizeof(FLOAT)); - float *C = (float *)malloc(x * sizeof(FLOAT)); - bfloat16_bits *AA = (bfloat16_bits *)malloc(x * x * sizeof(bfloat16_bits)); - bfloat16_bits *BB = (bfloat16_bits *)malloc(x * sizeof(bfloat16_bits)); - float *DD = (float *)malloc(x * sizeof(FLOAT)); - float *CC = (float *)malloc(x 
* sizeof(FLOAT)); + float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); + float *B = (float *)malloc_safe(x * sizeof(FLOAT)); + float *C = (float *)malloc_safe(x * sizeof(FLOAT)); + bfloat16_bits *AA = (bfloat16_bits *)malloc_safe(x * x * sizeof(bfloat16_bits)); + bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(x * sizeof(bfloat16_bits)); + float *DD = (float *)malloc_safe(x * sizeof(FLOAT)); + float *CC = (float *)malloc_safe(x * sizeof(FLOAT)); if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || (DD == NULL) || (CC == NULL)) return 1; From 77f85c7c00c3abbdd0fffc463fc1ec01dc198d88 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 16 Aug 2024 11:15:32 -0500 Subject: [PATCH 034/244] GEMV tests don't like zero elements. --- test/compare_sgemm_sbgemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 7ea71b63d3..eb47e83e5b 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -203,7 +203,7 @@ main (int argc, char *argv[]) } k = 1; - for (x = 0; x <= loop; x++) + for (x = 1; x <= loop; x++) { float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); float *B = (float *)malloc_safe(x * sizeof(FLOAT)); From 89702e1f4a8fbaf8f24f661eb6cca9f7f0e96a40 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 16 Aug 2024 11:37:39 -0500 Subject: [PATCH 035/244] Fix zero element GEMV test. --- test/compare_sgemm_sbgemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index eb47e83e5b..b8aaee8be3 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -202,9 +202,9 @@ main (int argc, char *argv[]) return ret; } - k = 1; for (x = 1; x <= loop; x++) { + k = (x == 0) ? 0 : 1; float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); float *B = (float *)malloc_safe(x * sizeof(FLOAT)); float *C = (float *)malloc_safe(x * sizeof(FLOAT)); From 4894c540554ef6e6d4d68688015d4f7a9b2755fa Mon Sep 17 00:00:00 2001 From: Deeksha Goplani Date: Mon, 2 Sep 2024 22:22:49 +0530 Subject: [PATCH 036/244] Improve TN case with further unrolling --- kernel/arm64/dgemm_small_kernel_tn_sve.c | 228 ++++++++++++++++++++++- 1 file changed, 222 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/dgemm_small_kernel_tn_sve.c b/kernel/arm64/dgemm_small_kernel_tn_sve.c index daca8e1bef..2ef23d7ee4 100644 --- a/kernel/arm64/dgemm_small_kernel_tn_sve.c +++ b/kernel/arm64/dgemm_small_kernel_tn_sve.c @@ -211,6 +211,7 @@ CNAME(BLASLONG M, const BLASLONG v_m1 = M & -v_size; const BLASLONG n4 = N & -4; const BLASLONG n2 = N & -2; + const BLASLONG n8 = N & -8; const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 
1 : 0; FLOAT* packed_a = @@ -229,28 +230,37 @@ CNAME(BLASLONG M, CREATE_A_POINTER(1, v_size); BLASLONG j = 0; - for (; j < n4; j += 4) { - + for (; j < n8; j += 8) { CREATE_B_POINTER(0, 0); CREATE_B_POINTER(1, 1); CREATE_B_POINTER(2, 2); CREATE_B_POINTER(3, 3); - UPDATE_B_POINTER(4); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); BLASLONG k = 0; DECLARE_RESULT_VECTOR(0, 0); DECLARE_RESULT_VECTOR(0, 1); DECLARE_RESULT_VECTOR(0, 2); DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + DECLARE_RESULT_VECTOR(0, 7); DECLARE_RESULT_VECTOR(1, 0); DECLARE_RESULT_VECTOR(1, 1); DECLARE_RESULT_VECTOR(1, 2); DECLARE_RESULT_VECTOR(1, 3); - + DECLARE_RESULT_VECTOR(1, 4); + DECLARE_RESULT_VECTOR(1, 5); + DECLARE_RESULT_VECTOR(1, 6); + DECLARE_RESULT_VECTOR(1, 7); if (LIKELY(packed_a != NULL)) { if (j == 0) { for (; k < K; k++) { - BROADCAST_LOAD_B(0, 0); GATHER_LOAD_A(pg_true, 0, 0); VECTOR_PACK_A(0, 0); @@ -267,10 +277,21 @@ CNAME(BLASLONG M, BROADCAST_LOAD_B(3, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); + BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); } } else { for (; k < K; k++) { - BROADCAST_LOAD_B(0, 0); UNPACK_VECTOR_A(0, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); @@ -285,7 +306,104 @@ CNAME(BLASLONG M, BROADCAST_LOAD_B(3, 0); UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); + BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); } + } + } else { + for (; k < K; k++) { + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + GATHER_LOAD_A(pg_true, 1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); + BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); + } + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 0, 4); + VECTOR_STORE(pg_true, 0, 5); + VECTOR_STORE(pg_true, 0, 6); + 
VECTOR_STORE(pg_true, 0, 7); + VECTOR_STORE(pg_true, 1, 0); + VECTOR_STORE(pg_true, 1, 1); + VECTOR_STORE(pg_true, 1, 2); + VECTOR_STORE(pg_true, 1, 3); + VECTOR_STORE(pg_true, 1, 4); + VECTOR_STORE(pg_true, 1, 5); + VECTOR_STORE(pg_true, 1, 6); + VECTOR_STORE(pg_true, 1, 7); + INCR_C_POINTER(0, 8); + INCR_C_POINTER(1, 8); + } + for (; j < n4; j += 4) { + + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + UPDATE_B_POINTER(4); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(1, 0); + DECLARE_RESULT_VECTOR(1, 1); + DECLARE_RESULT_VECTOR(1, 2); + DECLARE_RESULT_VECTOR(1, 3); + + if (LIKELY(packed_a != NULL)) { + for (; k < K; k++) { + + BROADCAST_LOAD_B(0, 0); + UNPACK_VECTOR_A(0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + UNPACK_VECTOR_A(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); } } else { for (; k < K; k++) { @@ -405,6 +523,55 @@ CNAME(BLASLONG M, CREATE_A_POINTER(0, 0); BLASLONG j = 0; + for (; j < n8; j += 8) { + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + DECLARE_RESULT_VECTOR(0, 7); + for (; k < K; k++) { + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_true, 0, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); + BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); + } + VECTOR_STORE(pg_true, 0, 0); + VECTOR_STORE(pg_true, 0, 1); + VECTOR_STORE(pg_true, 0, 2); + VECTOR_STORE(pg_true, 0, 3); + VECTOR_STORE(pg_true, 0, 4); + VECTOR_STORE(pg_true, 0, 5); + VECTOR_STORE(pg_true, 0, 6); + VECTOR_STORE(pg_true, 0, 7); + INCR_C_POINTER(0, 8); + } for (; j < n4; j += 4) { CREATE_B_POINTER(0, 0); @@ -487,6 +654,55 @@ CNAME(BLASLONG M, CREATE_A_POINTER(0, 0); BLASLONG j = 0; + for (; j < n8; j += 8) { + CREATE_B_POINTER(0, 0); + CREATE_B_POINTER(1, 1); + CREATE_B_POINTER(2, 2); + CREATE_B_POINTER(3, 3); + CREATE_B_POINTER(4, 4); + CREATE_B_POINTER(5, 5); + CREATE_B_POINTER(6, 6); + CREATE_B_POINTER(7, 7); + UPDATE_B_POINTER(8); + + BLASLONG k = 0; + DECLARE_RESULT_VECTOR(0, 0); + DECLARE_RESULT_VECTOR(0, 1); + DECLARE_RESULT_VECTOR(0, 2); + DECLARE_RESULT_VECTOR(0, 3); + DECLARE_RESULT_VECTOR(0, 4); + DECLARE_RESULT_VECTOR(0, 5); + DECLARE_RESULT_VECTOR(0, 6); + DECLARE_RESULT_VECTOR(0, 7); + for (; k < K; k++) { + BROADCAST_LOAD_B(0, 0); + GATHER_LOAD_A(pg_tail, 0, 0); 
+ UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); + BROADCAST_LOAD_B(1, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); + BROADCAST_LOAD_B(2, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); + BROADCAST_LOAD_B(3, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); + BROADCAST_LOAD_B(4, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 4, 0); + BROADCAST_LOAD_B(5, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 5, 0); + BROADCAST_LOAD_B(6, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 6, 0); + BROADCAST_LOAD_B(7, 0); + UPDATE_RESULT_VECTOR(pg_tail, 0, 7, 0); + } + VECTOR_STORE(pg_tail, 0, 0); + VECTOR_STORE(pg_tail, 0, 1); + VECTOR_STORE(pg_tail, 0, 2); + VECTOR_STORE(pg_tail, 0, 3); + VECTOR_STORE(pg_tail, 0, 4); + VECTOR_STORE(pg_tail, 0, 5); + VECTOR_STORE(pg_tail, 0, 6); + VECTOR_STORE(pg_tail, 0, 7); + INCR_C_POINTER(0, 8); + } for (; j < n4; j += 4) { CREATE_B_POINTER(0, 0); From 76227e2948b1f847abef003de5f6d49ea0dd3171 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 16 Jul 2024 15:57:24 -0500 Date: Fri, 6 Sep 2024 14:03:31 -0500 Subject: [PATCH 037/244] Initial commit for vectorized BF16 GEMV. Added GEMM_GEMV_FORWARD_BF16 to enable forwarding GEMM calls to the BF16 GEMV kernel when one matrix dimension is one. Updated the unit test to cover inc_x != 1 or inc_y != 1 for GEMV. --- Makefile.system | 6 +- cmake/system.cmake | 3 + interface/gemm.c | 2 +- kernel/power/KERNEL.POWER10 | 2 + kernel/power/KERNEL.POWER8 | 2 + kernel/power/KERNEL.POWER9 | 2 + kernel/power/sbgemv_common.c | 285 ++++++++++++++++++++++++++++++ kernel/power/sbgemv_n.c | 189 ++++++++++++++++++++ kernel/power/sbgemv_n_power10.c | 33 ++++ kernel/power/sbgemv_n_vsx.c | 303 ++++++++++++++++++++++++++++++++ kernel/power/sbgemv_t.c | 117 ++++++++++++ kernel/power/sbgemv_t_power10.c | 32 ++++ kernel/power/sbgemv_t_vsx.c | 286 ++++++++++++++++++++++++++++++ test/compare_sgemm_sbgemm.c | 31 ++-- 14 files changed, 1277 insertions(+), 16 deletions(-) create mode 100644 kernel/power/sbgemv_common.c create mode 100644 kernel/power/sbgemv_n.c create mode 100644 kernel/power/sbgemv_n_power10.c create mode 100644 kernel/power/sbgemv_n_vsx.c create mode 100644 kernel/power/sbgemv_t.c create mode 100644 kernel/power/sbgemv_t_power10.c create mode 100644 kernel/power/sbgemv_t_vsx.c diff --git a/Makefile.system b/Makefile.system index b065f9a981..8c030842a4 100644 --- a/Makefile.system +++ b/Makefile.system @@ -282,15 +282,19 @@ GEMM_GEMV_FORWARD = 1 endif ifeq ($(ARCH), power) GEMM_GEMV_FORWARD = 1 +GEMM_GEMV_FORWARD_BF16 = 1 endif ifeq ($(SMALL_MATRIX_OPT), 1) CCOMMON_OPT += -DSMALL_MATRIX_OPT endif -ifeq ($(GEMM_GEMV_FORWARD), 1) ifneq ($(ONLY_CBLAS), 1) +ifeq ($(GEMM_GEMV_FORWARD), 1) CCOMMON_OPT += -DGEMM_GEMV_FORWARD endif +ifeq ($(GEMM_GEMV_FORWARD_BF16), 1) +CCOMMON_OPT += -DGEMM_GEMV_FORWARD_BF16 +endif endif # This operation is expensive, so execution should be once.
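For context, the optimization this flag gates: a GEMM whose output has only one column (or one row, in the transposed case) is mathematically a matrix-vector multiply, so the interface layer can forward it to the far smaller GEMV kernel. Below is a minimal scalar reference sketch of the BF16 case; `bf16_to_f32` and `sbgemv_n_ref` are illustrative stand-ins, not the kernel symbols this patch adds (the real dispatch condition is in the interface/gemm.c hunk further below).

```c
#include <stddef.h>

typedef unsigned short bf16; /* raw bfloat16 bits */

/* Widen bfloat16 to float: bf16 is the high 16 bits of an IEEE-754 float. */
static float bf16_to_f32(bf16 v)
{
    union { unsigned int u; float f; } x = { (unsigned int)v << 16 };
    return x.f;
}

/* Scalar reference for y = alpha * A * x + beta * y with a column-major
 * m x k bfloat16 matrix A and float accumulation -- i.e. exactly the
 * operation a GEMM with n == 1 reduces to. */
static void sbgemv_n_ref(size_t m, size_t k, float alpha, const bf16 *a,
                         size_t lda, const bf16 *x, float beta, float *y)
{
    for (size_t i = 0; i < m; i++) {
        float sum = 0.0f;
        for (size_t j = 0; j < k; j++)
            sum += bf16_to_f32(a[j * lda + i]) * bf16_to_f32(x[j]);
        y[i] = alpha * sum + beta * y[i];
    }
}
```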
diff --git a/cmake/system.cmake b/cmake/system.cmake index a0b73ddae0..fb2d350abb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -398,6 +398,9 @@ endif () if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS) set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD") endif () +if (GEMM_GEMV_FORWARD_BF16 AND NOT ONLY_CBLAS) + set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD_BF16") +endif () if (SMALL_MATRIX_OPT) set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") endif () diff --git a/interface/gemm.c b/interface/gemm.c index 64b8b620cf..7cd0884fad 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -498,7 +498,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS args.m, args.n, args.k, args.lda, args.ldb, args.ldc); #endif -#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16) +#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) // Check if we can convert GEMM -> GEMV if (args.k != 0) { if (args.n == 1) { diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c84cd91d2a..956b401fb2 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -228,11 +228,13 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c +SBGEMVNKERNEL = sbgemv_n_power10.c DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_power10.c # SGEMVTKERNEL = sgemv_t.c +SBGEMVTKERNEL = sbgemv_t_power10.c DGEMVTKERNEL = dgemv_t_power10.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 700a68e447..001401d532 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -257,11 +257,13 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c +SBGEMVNKERNEL = sbgemv_n_vsx.c DGEMVNKERNEL = dgemv_n.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # SGEMVTKERNEL = sgemv_t.c +SBGEMVTKERNEL = sbgemv_t_vsx.c DGEMVTKERNEL = dgemv_t.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 7d007d1a2b..a18c31a2e9 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -181,11 +181,13 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c +SBGEMVNKERNEL = sbgemv_n_vsx.c DGEMVNKERNEL = dgemv_n.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # SGEMVTKERNEL = sgemv_t.c +SBGEMVTKERNEL = sbgemv_t_vsx.c DGEMVTKERNEL = dgemv_t.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c new file mode 100644 index 0000000000..2aadcca6ff --- /dev/null +++ b/kernel/power/sbgemv_common.c @@ -0,0 +1,285 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#ifndef SBGEMV_COMMON_C +#define SBGEMV_COMMON_C +#include "common.h" + +#include + +#define FORCEINLINE inline __attribute__((always_inline)) + +#ifdef __clang__ +#define uint16_t unsigned short +#define uint32_t unsigned int +#define uint64_t unsigned long long +#endif + +#ifdef _ARCH_PWR10 +#ifdef __has_builtin +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif +#endif + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v1, v0) +#else +#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v0, v1) +#endif + +#define USE_VECTOR_PAIRS +#endif + +typedef __vector IFLOAT vec_bf16; +typedef __vector FLOAT vec_f32; +typedef __vector unsigned char vec_uc8; + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define BF16_HI(data, zero) (vec_f32)vec_mergeh(data, zero) +#define BF16_LO(data, zero) (vec_f32)vec_mergel(data, zero) +#else +#define BF16_HI(data, zero) (vec_f32)vec_mergeh(zero, data) +#define BF16_LO(data, zero) (vec_f32)vec_mergel(zero, data) +#endif + +FORCEINLINE vec_uc8 vec_load_vec(void *src) +{ + return vec_xl(0, (unsigned char *)(src)); +} + +FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src) +{ +#ifdef USE_VECTOR_PAIRS + __vector_pair vy0p; + vy0p = *(__vector_pair *)(src); + __builtin_vsx_disassemble_pair((void *)(dst), &vy0p); +#else + dst[0] = src[0]; + dst[1] = src[1]; +#endif +} + +FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src) +{ +#ifdef USE_VECTOR_PAIRS + __vector_pair vy0p; + __builtin_vsx_assemble_pair2(&vy0p, (vec_uc8)src[1], (vec_uc8)src[0]); + *(__vector_pair *)(dst) = vy0p; +#else + dst[0] = src[0]; + dst[1] = src[1]; +#endif +} + +FORCEINLINE vec_bf16 vec_loadN(void *src, BLASLONG n) +{ + IFLOAT *src2 = (IFLOAT *)(src); +#ifdef _ARCH_PWR9 + return vec_xl_len(src2, n * sizeof(IFLOAT)); +#else + __attribute__((aligned(16))) IFLOAT data[sizeof(vec_bf16) / sizeof(IFLOAT)]; + memset(data, 0, sizeof(vec_bf16)); + if (n & 4) { + memcpy(data, src2, sizeof(uint64_t)); + } + if (n & 2) { + BLASLONG n4 = n & 4; + memcpy(data + n4, src2 + n4, sizeof(uint32_t)); + } + if (n & 1) { + BLASLONG n6 = n & 6; + data[n6] = src2[n6]; + } + return (vec_bf16)vec_load_vec(data); +#endif +} + +FORCEINLINE vec_f32 vec_loadNHi(void *src, BLASLONG n, 
vec_bf16 zero) +{ + vec_bf16 data = vec_loadN(src, n); + return BF16_HI(data, zero); +} + +FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n) +{ +#ifndef _ARCH_PWR9 + if (n & 4) { + return (vec_f32)vec_load_vec(src); + } +#endif + return (vec_f32)vec_loadN(src, n * (sizeof(FLOAT) / sizeof(IFLOAT))); +} + +FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n) +{ + FLOAT *dst2 = (FLOAT *)(dst); +#ifdef _ARCH_PWR9 + vec_xst_len(data, dst2, n * sizeof(FLOAT)); +#else + if (n & 4) { + vec_xst(data, 0, dst2); + return; + } + __attribute__((aligned(16))) FLOAT data2[sizeof(vec_f32) / sizeof(FLOAT)]; + vec_xst(data, 0, data2); + if (n & 2) { + memcpy(dst2, data2, sizeof(uint64_t)); + } + if (n & 1) { + BLASLONG n2 = n & 2; + dst2[n2] = data2[n2]; + } +#endif +} + +FORCEINLINE vec_f32 vec_mult(vec_f32 *inp, vec_bf16 in0, vec_bf16 zero) +{ + vec_f32 v_in00 = BF16_HI(in0, zero); + vec_f32 v_in01 = BF16_LO(in0, zero); + + return (inp[0] * v_in00) + (inp[1] * v_in01); +} + +FORCEINLINE vec_f32 vec_load_mult(vec_bf16 *in, vec_f32 *inp, vec_bf16 zero) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(in); + + return vec_mult(inp, in0, zero); +} + +FORCEINLINE void vec_load_vec2(vec_bf16 *in, BLASLONG i, vec_f32 *v_x0, vec_bf16 zero) +{ + vec_bf16 inp = (vec_bf16)vec_load_vec(&in[i]); + + v_x0[0] = BF16_HI(inp, zero); + v_x0[1] = BF16_LO(inp, zero); +} + +FORCEINLINE void vec_mult2(vec_f32 v_x0, vec_bf16 in0, vec_bf16 zero, vec_f32 *vy0) +{ + vec_f32 v_in00 = BF16_HI(in0, zero); + vec_f32 v_in01 = BF16_LO(in0, zero); + + vy0[0] += (v_x0 * v_in00); + vy0[1] += (v_x0 * v_in01); +} + +FORCEINLINE void vec_load_mult2(vec_f32 v_x0, vec_bf16 *in, vec_bf16 zero, vec_f32 *vy0) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(in); + + vec_mult2(v_x0, in0, zero, vy0); +} + +FORCEINLINE vec_f32 vec_loadN_mult(vec_bf16 *in, vec_f32 *inp, BLASLONG n, vec_bf16 zero) +{ + vec_bf16 in0 = vec_loadN(in, n); + + return vec_mult(inp, in0, zero); +} + +FORCEINLINE void vec_loadN_vec2(vec_bf16 *in, BLASLONG i, vec_f32 *v_x0, BLASLONG n, vec_bf16 zero) +{ + vec_bf16 inp = vec_loadN(&in[i], n); + + v_x0[0] = BF16_HI(inp, zero); + v_x0[1] = BF16_LO(inp, zero); +} + +FORCEINLINE void vec_loadN_mult2(vec_f32 v_x0, vec_bf16 *in, BLASLONG n, vec_bf16 zero, vec_f32 *vy0) +{ + vec_bf16 in0 = vec_loadN(in, n); + + vec_mult2(v_x0, in0, zero, vy0); +} + +FORCEINLINE vec_f32 vec_loadNHi_mult(vec_bf16 *in, vec_f32 v_inp0, BLASLONG n, vec_bf16 zero) +{ + vec_f32 v_in00 = vec_loadNHi(in, n, zero); + + return (v_inp0 * v_in00); +} + +FORCEINLINE vec_f32 vec_loadNHi_multi2(vec_f32 v_x0, vec_bf16 *in, BLASLONG n, vec_bf16 zero) +{ + vec_f32 v_in00 = vec_loadNHi(in, n, zero); + + return (v_x0 * v_in00); +} + +FORCEINLINE vec_f32 vec_loadNHi_vec(vec_bf16 *in, BLASLONG i, BLASLONG n, vec_bf16 zero) +{ + return vec_loadNHi(&in[i], n, zero); +} + +FORCEINLINE void copy_x(BLASLONG n, IFLOAT *src, IFLOAT *dest, BLASLONG inc_src) +{ + for (BLASLONG i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +FORCEINLINE void copy_y_beta(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) +{ + if (beta == 0) { + memset(dest, 0, sizeof(FLOAT) * n); + } else { + for (BLASLONG i = 0; i < n; i++) { + *dest++ = *src * beta; + src += inc_src; + } + } +} + +FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) +{ + if (beta == 0) { + for (BLASLONG i = 0; i < n; i++) { + *dest = *src++; + dest += inc_src; + } + } else { + for (BLASLONG i = 0; i < n; i++) { + *dest = *src++ + (beta * *dest); + dest 
+= inc_src; + } + } +} + +FORCEINLINE void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + for (BLASLONG i = 0; i < n; i++) { + *dest += *src++; + dest += inc_dest; + } +} +#endif diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c new file mode 100644 index 0000000000..854ad93ee2 --- /dev/null +++ b/kernel/power/sbgemv_n.c @@ -0,0 +1,189 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#ifndef SBGEMV_N_COMMON_C +#define SBGEMV_N_COMMON_C +static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vector, FLOAT beta) +{ + if (beta == 0) { + memset(output_vector, 0, sizeof(FLOAT) * n); + } else { + vec_f32 b = { beta, beta, beta, beta }; + + vec_f32 *in = (vec_f32 *)input_vector; + vec_f32 *out = (vec_f32 *)output_vector; + + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 v_inp0[2]; + + for (; i + 4 <= n8; i += 4) { + vec_f32 v_inp1[2], v_inp2[2], v_inp3[2]; + vec_load_pair(v_inp0, &in[(i * 2) + 0]); + vec_load_pair(v_inp1, &in[(i * 2) + 2]); + vec_load_pair(v_inp2, &in[(i * 2) + 4]); + vec_load_pair(v_inp3, &in[(i * 2) + 6]); + v_inp0[0] *= b; + v_inp0[1] *= b; + v_inp1[0] *= b; + v_inp1[1] *= b; + v_inp2[0] *= b; + v_inp2[1] *= b; + v_inp3[0] *= b; + v_inp3[1] *= b; + vec_store_pair(&out[(i * 2) + 0], v_inp0); + vec_store_pair(&out[(i * 2) + 2], v_inp1); + vec_store_pair(&out[(i * 2) + 4], v_inp2); + vec_store_pair(&out[(i * 2) + 6], v_inp3); + } + + for (; i < n8; i++) { + vec_load_pair(v_inp0, &in[(i * 2) + 0]); + v_inp0[0] *= b; + v_inp0[1] *= b; + vec_store_pair(&out[(i * 2) + 0], v_inp0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + v_inp0[0] = in[(i * 2) + 0]; + v_inp0[1] = vec_loadN_f32(&in[(i * 2) + 1], n3); + v_inp0[0] *= b; + v_inp0[1] *= b; + out[(i * 2) + 0] = v_inp0[0]; + vec_storeN_f32(v_inp0[1], &out[(i * 2) + 1], n3); + } else if (n) { + v_inp0[0] = vec_loadN_f32(&in[(i * 2) + 0], n); + v_inp0[0] *= b; + vec_storeN_f32(v_inp0[0], &out[(i * 2) + 0], n); + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + IFLOAT *x_ptr, *ap[4]; + IFLOAT xbuffer[8] __attribute__((aligned(16))); + FLOAT *y_ptr, *ybuffer; + FLOAT buffer[NBMAX] __attribute__((aligned(16))); + + if ((m < 1) || (n < 1)) return 0; + + ybuffer = buffer; + y_ptr = y; + + BLASLONG lda4 = lda << 2; + BLASLONG lda8 = lda << 3; + BLASLONG NB = NBMAX; + BLASLONG m2 = (m & (NBMAX - 1)); + + while (NB == NBMAX) { + m -= NB; + if (m < 0) { + if (m2 == 0) break; + NB = m2; + } + + if (inc_y != 1) { + copy_y_beta(NB, y_ptr, ybuffer, inc_y, beta); + } else { + ybuffer = y_ptr; + BF16GEMV_N_beta(NB, ybuffer, ybuffer, beta); + } + + x_ptr = x; + + ap[0] = a; + ap[1] = a + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_x == 1) { + for (BLASLONG j = 0; j + 8 <= n; j += 8) { + BF16GEMV_N_8(NB, ap, x_ptr, ybuffer, lda4, alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + x_ptr += 8; + } + if (n & 4) { + BF16GEMV_N_4(NB, ap, x_ptr, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + x_ptr += 4; + } + if (n & 2) { + BF16GEMV_N_2(NB, ap, x_ptr, ybuffer, alpha); + ap[0] += (lda * 2); + x_ptr += 2; + } + if (n & 1) { + BF16GEMV_N_1(NB, ap, x_ptr, ybuffer, alpha); + } + } else { + for (BLASLONG j = 0; j + 8 <= n; j += 8) { + copy_x(8, x_ptr, xbuffer, inc_x); + BF16GEMV_N_8(NB, ap, xbuffer, ybuffer, lda4, alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + x_ptr += 8 * inc_x; + } + if (n & 4) { + copy_x(4, x_ptr, xbuffer, inc_x); + BF16GEMV_N_4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + x_ptr += 4 * inc_x; + } + if (n & 2) { + copy_x(2, x_ptr, xbuffer, inc_x); + BF16GEMV_N_2(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += (lda * 2); + x_ptr += 2 * inc_x; + } + if (n & 1) { + copy_x(1, x_ptr, xbuffer, inc_x); + 
BF16GEMV_N_1(NB, ap, xbuffer, ybuffer, alpha); + } + } + + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += (NB * inc_y); + } else { + y_ptr += NB; + } + } + + return 0; +} +#endif diff --git a/kernel/power/sbgemv_n_power10.c b/kernel/power/sbgemv_n_power10.c new file mode 100644 index 0000000000..fc83b38c37 --- /dev/null +++ b/kernel/power/sbgemv_n_power10.c @@ -0,0 +1,33 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +//#include "sbgemv_common.c" + +#include "sbgemv_n_vsx.c" + +//#include "sbgemv_n.c" + diff --git a/kernel/power/sbgemv_n_vsx.c b/kernel/power/sbgemv_n_vsx.c new file mode 100644 index 0000000000..ddbf908b3f --- /dev/null +++ b/kernel/power/sbgemv_n_vsx.c @@ -0,0 +1,303 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#ifndef SBGEMV_N_VSX +#define SBGEMV_N_VSX + +#include "sbgemv_common.c" + +#define NBMAX 4096 + +static void BF16GEMV_N_VSX_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_f32 x_0 = vec_loadNHi(x_bf, 1, zero); + x_0 *= v_alpha; + + vec_f32 v_x0 = vec_splat(x_0, 0); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 vy0[2]; + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult2(v_x0, &va0[i], zero, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vy0[0] = v_y[(i * 2) + 0]; + vy0[1] = vec_loadN_f32(&v_y[(i * 2) + 1], n3); + + vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); + + v_y[(i * 2) + 0] = vy0[0]; + vec_storeN_f32(vy0[1], &v_y[(i * 2) + 1], n3); + } else if (n) { + vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero); + + vec_storeN_f32(vy0, &v_y[(i * 2) + 0], n); + } +} + +static void BF16GEMV_N_VSX_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_f32 x_0 = vec_loadNHi(x_bf, 2, zero); + x_0 *= v_alpha; + + vec_f32 v_x0 = vec_splat(x_0, 0); + vec_f32 v_x1 = vec_splat(x_0, 1); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 vy0[2]; + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult2(v_x0, &va0[i], zero, vy0); + vec_load_mult2(v_x1, &va1[i], zero, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vy0[0] = v_y[(i * 2) + 0]; + vy0[1] = vec_loadN_f32(&v_y[(i * 2) + 1], n3); + + vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); + vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); + + v_y[(i * 2) + 0] = vy0[0]; + vec_storeN_f32(vy0[1], &v_y[(i * 2) + 1], n3); + } else if (n) { + vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x1, &va1[i], n, zero); + + vec_storeN_f32(vy0, &v_y[(i * 2) + 0], n); + } +} + +static void BF16GEMV_N_VSX_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + vec_bf16 *va2 = (vec_bf16 *)a2; + vec_bf16 *va3 = (vec_bf16 *)a3; + + vec_bf16 *x_bf = 
(vec_bf16 *)(xo); + vec_f32 x_0 = vec_loadNHi(x_bf, 4, zero); + x_0 *= v_alpha; + + vec_f32 v_x0 = vec_splat(x_0, 0); + vec_f32 v_x1 = vec_splat(x_0, 1); + vec_f32 v_x2 = vec_splat(x_0, 2); + vec_f32 v_x3 = vec_splat(x_0, 3); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 vy0[2]; + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult2(v_x0, &va0[i], zero, vy0); + vec_load_mult2(v_x1, &va1[i], zero, vy0); + vec_load_mult2(v_x2, &va2[i], zero, vy0); + vec_load_mult2(v_x3, &va3[i], zero, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vy0[0] = v_y[(i * 2) + 0]; + vy0[1] = vec_loadN_f32(&v_y[(i * 2) + 1], n3); + + vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); + vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); + vec_loadN_mult2(v_x2, &va2[i], n, zero, vy0); + vec_loadN_mult2(v_x3, &va3[i], n, zero, vy0); + + v_y[(i * 2) + 0] = vy0[0]; + vec_storeN_f32(vy0[1], &v_y[(i * 2) + 1], n3); + } else if (n) { + vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x1, &va1[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x2, &va2[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x3, &va3[i], n, zero); + + vec_storeN_f32(vy0, &v_y[(i * 2) + 0], n); + } +} + +static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4; + b1 = a1 + lda4; + b2 = a2 + lda4; + b3 = a3 + lda4; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + vec_bf16 *va2 = (vec_bf16 *)a2; + vec_bf16 *va3 = (vec_bf16 *)a3; + vec_bf16 *vb0 = (vec_bf16 *)b0; + vec_bf16 *vb1 = (vec_bf16 *)b1; + vec_bf16 *vb2 = (vec_bf16 *)b2; + vec_bf16 *vb3 = (vec_bf16 *)b3; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_bf16 x_in = (vec_bf16)vec_load_vec(x_bf); + vec_f32 x_0 = BF16_HI(x_in, zero); + vec_f32 x_1 = BF16_LO(x_in, zero); + x_0 *= v_alpha; + x_1 *= v_alpha; + + vec_f32 v_x0 = vec_splat(x_0, 0); + vec_f32 v_x1 = vec_splat(x_0, 1); + vec_f32 v_x2 = vec_splat(x_0, 2); + vec_f32 v_x3 = vec_splat(x_0, 3); + vec_f32 v_x4 = vec_splat(x_1, 0); + vec_f32 v_x5 = vec_splat(x_1, 1); + vec_f32 v_x6 = vec_splat(x_1, 2); + vec_f32 v_x7 = vec_splat(x_1, 3); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 vy0[2]; + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult2(v_x0, &va0[i], zero, vy0); + vec_load_mult2(v_x1, &va1[i], zero, vy0); + vec_load_mult2(v_x2, &va2[i], zero, vy0); + vec_load_mult2(v_x3, &va3[i], zero, vy0); + vec_load_mult2(v_x4, &vb0[i], zero, vy0); + vec_load_mult2(v_x5, &vb1[i], zero, vy0); + vec_load_mult2(v_x6, &vb2[i], zero, vy0); + vec_load_mult2(v_x7, &vb3[i], zero, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vy0[0] = v_y[(i * 2) + 0]; + vy0[1] = vec_loadN_f32(&v_y[(i * 2) + 1], n3); + + vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); + vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); + vec_loadN_mult2(v_x2, &va2[i], n, zero, vy0); + vec_loadN_mult2(v_x3, &va3[i], n, zero, vy0); + vec_loadN_mult2(v_x4, &vb0[i], n, zero, vy0); + vec_loadN_mult2(v_x5, &vb1[i], n, zero, vy0); + vec_loadN_mult2(v_x6, &vb2[i], n, zero, vy0); + vec_loadN_mult2(v_x7, 
&vb3[i], n, zero, vy0); + + v_y[(i * 2) + 0] = vy0[0]; + vec_storeN_f32(vy0[1], &v_y[(i * 2) + 1], n3); + } else + if (n) { + vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x1, &va1[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x2, &va2[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x3, &va3[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x4, &vb0[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x5, &vb1[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x6, &vb2[i], n, zero); + vy0 += vec_loadNHi_multi2(v_x7, &vb3[i], n, zero); + + vec_storeN_f32(vy0, &v_y[(i * 2) + 0], n); + } +} + +#define BF16GEMV_N_8 BF16GEMV_N_VSX_8 +#define BF16GEMV_N_4 BF16GEMV_N_VSX_4 +#define BF16GEMV_N_2 BF16GEMV_N_VSX_2 +#define BF16GEMV_N_1 BF16GEMV_N_VSX_1 + +#include "sbgemv_n.c" +#endif diff --git a/kernel/power/sbgemv_t.c b/kernel/power/sbgemv_t.c new file mode 100644 index 0000000000..f0c79fe77a --- /dev/null +++ b/kernel/power/sbgemv_t.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#ifndef SBGEMV_T_COMMON_C +#define SBGEMV_T_COMMON_C +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + IFLOAT *xbuffer, *a_ptr; + IFLOAT buffer[NBMAX] __attribute__((aligned(16))); + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *y_ptr; + + if ((m < 1) || (n < 1)) return 0; + + xbuffer = buffer; + + BLASLONG lda4 = lda << 2; + BLASLONG lda8 = lda << 3; + BLASLONG NB = NBMAX; + BLASLONG m2 = (m & (NBMAX - 1)); + + while (NB == NBMAX) { + m -= NB; + if (m < 0) { + if (m2 == 0) break; + NB = m2; + } + + a_ptr = a; + y_ptr = y; + + if (inc_x != 1) { + copy_x(NB, x, xbuffer, inc_x); + } else { + xbuffer = x; + } + + if (inc_y == 1) { + for (BLASLONG j = 0; j + 8 <= n; j += 8) { + BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); + y_ptr += 8; + a_ptr += lda8; + } + if (n & 4) { + BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); + y_ptr += 4; + a_ptr += lda4; + } + if (n & 2) { + BF16GEMV_T_2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); + y_ptr += 2; + a_ptr += (lda * 2); + } + if (n & 1) { + BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); + } + } else { + for (BLASLONG j = 0; j + 8 <= n; j += 8) { + memset(ybuffer, 0, sizeof(FLOAT) * 8); + BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); + copy_y(8, ybuffer, y_ptr, inc_y, beta); + y_ptr += 8 * inc_y; + a_ptr += lda8; + } + if (n & 4) { + memset(ybuffer, 0, sizeof(FLOAT) * 4); + BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); + copy_y(4, ybuffer, y_ptr, inc_y, beta); + y_ptr += 4 * inc_y; + a_ptr += lda4; + } + if (n & 2) { + memset(ybuffer, 0, sizeof(FLOAT) * 4); + BF16GEMV_T_2(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); + copy_y(2, ybuffer, y_ptr, inc_y, beta); + y_ptr += 2 * inc_y; + a_ptr += (lda * 2); + } + if (n & 1) { + memset(ybuffer, 0, sizeof(FLOAT) * 4); + BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); + copy_y(1, ybuffer, y_ptr, inc_y, beta); + } + } + + a += NB; + x += NB * inc_x; + } + + return 0; +} +#endif + diff --git a/kernel/power/sbgemv_t_power10.c b/kernel/power/sbgemv_t_power10.c new file mode 100644 index 0000000000..08bc4237c7 --- /dev/null +++ b/kernel/power/sbgemv_t_power10.c @@ -0,0 +1,32 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +//#include "sbgemv_common.c" + +#include "sbgemv_t_vsx.c" + +//#include "sbgemv_t.c" diff --git a/kernel/power/sbgemv_t_vsx.c b/kernel/power/sbgemv_t_vsx.c new file mode 100644 index 0000000000..7da894109b --- /dev/null +++ b/kernel/power/sbgemv_t_vsx.c @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#ifndef SBGEMV_T_VSX +#define SBGEMV_T_VSX + +#include "sbgemv_common.c" + +#define NBMAX 4096 + +static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +{ + IFLOAT *a0; + vec_bf16 *va0, *v_x; + vec_f32 temp0 = { 0, 0, 0, 0 }; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 inp[2]; + + a0 = ap; + va0 = (vec_bf16 *)a0; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i < n8; i++) { + vec_load_vec2(v_x, i, inp, zero); + + temp0 += vec_load_mult(&va0[i], inp, zero); + } + + n &= 7; + if (n > 4) { + vec_loadN_vec2(v_x, i, inp, n, zero); + + temp0 += vec_loadN_mult(&va0[i], inp, n, zero); + } else if (n) { + vec_f32 v_inp0 = vec_loadNHi_vec(v_x, i, n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], v_inp0, n, zero); + } + + y[0] = (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])) + (beta * y[0]); +} + +static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +{ + IFLOAT *a0, *a1; + vec_bf16 *va0, *va1, *v_x; + vec_f32 temp0 = { 0, 0, 0, 0 }; + vec_f32 temp1 = { 0, 0, 0, 0 }; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 inp[2]; + + a0 = ap; + a1 = ap + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i < n8; i++) { + vec_load_vec2(v_x, i, inp, zero); + + temp0 += vec_load_mult(&va0[i], inp, zero); + temp1 += vec_load_mult(&va1[i], inp, zero); + } + + n &= 7; + if (n > 4) { + vec_loadN_vec2(v_x, i, inp, n, zero); + + temp0 += vec_loadN_mult(&va0[i], inp, n, zero); + temp1 += vec_loadN_mult(&va1[i], inp, n, zero); + } else if (n) { + vec_f32 v_inp0 = vec_loadNHi_vec(v_x, i, n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], v_inp0, n, zero); + temp1 += vec_loadNHi_mult(&va1[i], v_inp0, n, zero); + } + + y[0] = (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])) + (beta * y[0]); + y[1] = (alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3])) + (beta * y[1]); +} + +static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +{ + IFLOAT *a0, *a1, *a2, *a3; + vec_bf16 *va0, *va1, *va2, *va3, *v_x; + vec_f32 temp0 = { 0, 0, 0, 0 }; + vec_f32 temp1 = { 0, 0, 0, 0 }; + vec_f32 temp2 = { 0, 0, 0, 0 }; + vec_f32 temp3 = { 0, 0, 0, 0 }; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 inp[2]; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + va2 = (vec_bf16 *)a2; + va3 = (vec_bf16 *)a3; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i < n8; i++) { + vec_load_vec2(v_x, i, inp, zero); + + temp0 += vec_load_mult(&va0[i], inp, zero); + temp1 += vec_load_mult(&va1[i], inp, zero); + temp2 += vec_load_mult(&va2[i], inp, zero); + temp3 += vec_load_mult(&va3[i], inp, zero); + } + + n &= 7; + if (n > 4) { + vec_loadN_vec2(v_x, i, inp, n, zero); + + temp0 += vec_loadN_mult(&va0[i], inp, n, zero); + temp1 += vec_loadN_mult(&va1[i], inp, n, zero); + temp2 += vec_loadN_mult(&va2[i], inp, n, zero); + temp3 += vec_loadN_mult(&va3[i], inp, n, zero); + } else if (n) { + vec_f32 v_inp0 = vec_loadNHi_vec(v_x, i, n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], v_inp0, n, zero); + temp1 += vec_loadNHi_mult(&va1[i], v_inp0, n, zero); + temp2 += vec_loadNHi_mult(&va2[i], v_inp0, n, zero); + temp3 += vec_loadNHi_mult(&va3[i], v_inp0, n, zero); + } + + vec_f32 t0, 
t1, t2, t3; + vec_f32 a = { alpha, alpha, alpha, alpha }; + vec_f32 b = { beta, beta, beta, beta }; + vec_f32 *v_y = (vec_f32 *) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + v_y[0] = (a * temp0) + (b * v_y[0]); +} + +static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +{ + IFLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + vec_bf16 *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + vec_f32 temp0 = { 0, 0, 0, 0 }; + vec_f32 temp1 = { 0, 0, 0, 0 }; + vec_f32 temp2 = { 0, 0, 0, 0 }; + vec_f32 temp3 = { 0, 0, 0, 0 }; + vec_f32 temp4 = { 0, 0, 0, 0 }; + vec_f32 temp5 = { 0, 0, 0, 0 }; + vec_f32 temp6 = { 0, 0, 0, 0 }; + vec_f32 temp7 = { 0, 0, 0, 0 }; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 inp[2]; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + va2 = (vec_bf16 *)a2; + va3 = (vec_bf16 *)a3; + va4 = (vec_bf16 *)a4; + va5 = (vec_bf16 *)a5; + va6 = (vec_bf16 *)a6; + va7 = (vec_bf16 *)a7; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i < n8; i++) { + vec_load_vec2(v_x, i, inp, zero); + + temp0 += vec_load_mult(&va0[i], inp, zero); + temp1 += vec_load_mult(&va1[i], inp, zero); + temp2 += vec_load_mult(&va2[i], inp, zero); + temp3 += vec_load_mult(&va3[i], inp, zero); + temp4 += vec_load_mult(&va4[i], inp, zero); + temp5 += vec_load_mult(&va5[i], inp, zero); + temp6 += vec_load_mult(&va6[i], inp, zero); + temp7 += vec_load_mult(&va7[i], inp, zero); + } + + n &= 7; + if (n > 4) { + vec_loadN_vec2(v_x, i, inp, n, zero); + + temp0 += vec_loadN_mult(&va0[i], inp, n, zero); + temp1 += vec_loadN_mult(&va1[i], inp, n, zero); + temp2 += vec_loadN_mult(&va2[i], inp, n, zero); + temp3 += vec_loadN_mult(&va3[i], inp, n, zero); + temp4 += vec_loadN_mult(&va4[i], inp, n, zero); + temp5 += vec_loadN_mult(&va5[i], inp, n, zero); + temp6 += vec_loadN_mult(&va6[i], inp, n, zero); + temp7 += vec_loadN_mult(&va7[i], inp, n, zero); + } else if (n) { + vec_f32 v_inp0 = vec_loadNHi_vec(v_x, i, n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], v_inp0, n, zero); + temp1 += vec_loadNHi_mult(&va1[i], v_inp0, n, zero); + temp2 += vec_loadNHi_mult(&va2[i], v_inp0, n, zero); + temp3 += vec_loadNHi_mult(&va3[i], v_inp0, n, zero); + temp4 += vec_loadNHi_mult(&va4[i], v_inp0, n, zero); + temp5 += vec_loadNHi_mult(&va5[i], v_inp0, n, zero); + temp6 += vec_loadNHi_mult(&va6[i], v_inp0, n, zero); + temp7 += vec_loadNHi_mult(&va7[i], v_inp0, n, zero); + } + + vec_f32 t0, t1, t2, t3; + vec_f32 a = { alpha, alpha, alpha, alpha }; + vec_f32 b = { beta, beta, beta, beta }; + vec_f32 *v_y = (vec_f32 *) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + t0 = vec_mergeh(temp4, temp6); + t1 = vec_mergel(temp4, temp6); + t2 = vec_mergeh(temp5, temp7); + t3 = vec_mergel(temp5, temp7); + temp4 = vec_mergeh(t0, t2); + temp5 = vec_mergel(t0, t2); + temp6 = vec_mergeh(t1, t3); + temp7 = vec_mergel(t1, t3); + temp4 += temp5 + 
temp6 + temp7; + + v_y[0] = (a * temp0) + (b * v_y[0]); + v_y[1] = (a * temp4) + (b * v_y[1]); +} + +#define BF16GEMV_T_8 BF16GEMV_T_VSX_8 +#define BF16GEMV_T_4 BF16GEMV_T_VSX_4 +#define BF16GEMV_T_2 BF16GEMV_T_VSX_2 +#define BF16GEMV_T_1 BF16GEMV_T_VSX_1 + +#include "sbgemv_t.c" +#endif + diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index b8aaee8be3..a86c73d1c5 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -202,16 +202,18 @@ main (int argc, char *argv[]) return ret; } + for (l = 0; l < 2; l++) { // l = 1 to test inc_x & inc_y not equal to one. for (x = 1; x <= loop; x++) { - k = (x == 0) ? 0 : 1; + m = l + 1; + k = (x == 0) ? 0 : m; float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); - float *B = (float *)malloc_safe(x * sizeof(FLOAT)); - float *C = (float *)malloc_safe(x * sizeof(FLOAT)); + float *B = (float *)malloc_safe(x * sizeof(FLOAT) * m); + float *C = (float *)malloc_safe(x * sizeof(FLOAT) * m); bfloat16_bits *AA = (bfloat16_bits *)malloc_safe(x * x * sizeof(bfloat16_bits)); - bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(x * sizeof(bfloat16_bits)); + bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(x * sizeof(bfloat16_bits) * m); float *DD = (float *)malloc_safe(x * sizeof(FLOAT)); - float *CC = (float *)malloc_safe(x * sizeof(FLOAT)); + float *CC = (float *)malloc_safe(x * sizeof(FLOAT) * m); if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || (DD == NULL) || (CC == NULL)) return 1; @@ -226,9 +228,9 @@ main (int argc, char *argv[]) sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one); AA[j * x + i].v = atmp; } - B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - sbstobf16_(&one, &B[j], &one, &btmp, &one); - BB[j].v = btmp; + B[j*m] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j*m], &one, &btmp, &one); + BB[j*m].v = btmp; } for (y = 0; y < 2; y++) { @@ -238,9 +240,9 @@ main (int argc, char *argv[]) transA = 'T'; } - memset(CC, 0, x * sizeof(FLOAT)); + memset(CC, 0, x * m * sizeof(FLOAT)); memset(DD, 0, x * sizeof(FLOAT)); - memset(C, 0, x * sizeof(FLOAT)); + memset(C, 0, x * m * sizeof(FLOAT)); SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); @@ -248,15 +250,15 @@ main (int argc, char *argv[]) for (j = 0; j < x; j++) for (i = 0; i < x; i++) if (transA == 'N') { - DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]); + DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j*m]); } else if (transA == 'T') { - DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]); + DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i*m]); } for (j = 0; j < x; j++) { - if (fabs (CC[j] - C[j]) > 1.0) + if (fabs (CC[j*m] - C[j*m]) > 1.0) ret++; - if (fabs (CC[j] - DD[j]) > 1.0) + if (fabs (CC[j*m] - DD[j]) > 1.0) ret++; } } @@ -268,6 +270,7 @@ main (int argc, char *argv[]) free(DD); free(CC); } + } if (ret != 0) fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); From 8541b25e1d755f7e05594547184bd88bda23a5af Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 6 Sep 2024 14:48:48 -0500 Subject: [PATCH 038/244] Special case beta is one. 
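Why this matters: with beta == 1 the update y[i] = src[i] + beta * y[i] degenerates to a plain accumulate, so the per-element multiply can be dropped, just as the existing beta == 0 path skips reading y entirely. A standalone scalar sketch of the three-way split that the copy_y/copy_y_beta hunks below implement; the name and plain-float signature are simplifications of the kernel code:

```c
#include <stddef.h>

/* y[i * inc] = src[i] + beta * y[i * inc], split by beta:
 *   beta == 0: overwrite, never read y (it may hold garbage),
 *   beta == 1: pure accumulate, no multiply,
 *   otherwise: general fused update. */
static void copy_y_sketch(size_t n, const float *src, float *y,
                          size_t inc, float beta)
{
    if (beta == 0.0f) {
        for (size_t i = 0; i < n; i++, y += inc) *y = src[i];
    } else if (beta == 1.0f) {
        for (size_t i = 0; i < n; i++, y += inc) *y += src[i];
    } else {
        for (size_t i = 0; i < n; i++, y += inc) *y = src[i] + beta * *y;
    }
}
```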
--- kernel/power/sbgemv_common.c | 10 ++++++++++ kernel/power/sbgemv_n.c | 2 ++ 2 files changed, 12 insertions(+) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index 2aadcca6ff..b11ab59de8 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -252,6 +252,11 @@ FORCEINLINE void copy_y_beta(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_s { if (beta == 0) { memset(dest, 0, sizeof(FLOAT) * n); + } else if (beta == 1) { + for (BLASLONG i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } } else { for (BLASLONG i = 0; i < n; i++) { *dest++ = *src * beta; @@ -267,6 +272,11 @@ FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, F *dest = *src++; dest += inc_src; } + } else if (beta == 1) { + for (BLASLONG i = 0; i < n; i++) { + *dest += *src++; + dest += inc_src; + } } else { for (BLASLONG i = 0; i < n; i++) { *dest = *src++ + (beta * *dest); diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c index 854ad93ee2..db64915e05 100644 --- a/kernel/power/sbgemv_n.c +++ b/kernel/power/sbgemv_n.c @@ -31,6 +31,8 @@ static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vecto { if (beta == 0) { memset(output_vector, 0, sizeof(FLOAT) * n); + } else if ((output_vector != input_vector) && (beta == 1)) { + memcpy(output_vector, input_vector, sizeof(FLOAT) * n); } else { vec_f32 b = { beta, beta, beta, beta }; From 39fd29f1de36763c77c8bfe5acb8a6337046f748 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Sun, 8 Sep 2024 18:28:31 -0500 Subject: [PATCH 039/244] Minor improvement and turn off BF16 GEMV forwarding by default. --- Makefile.system | 1 - kernel/power/sbgemv_n.c | 6 ++++-- test/compare_sgemm_sbgemm.c | 29 ++++++++++++++--------------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Makefile.system b/Makefile.system index 8c030842a4..2c5ca96906 100644 --- a/Makefile.system +++ b/Makefile.system @@ -282,7 +282,6 @@ GEMM_GEMV_FORWARD = 1 endif ifeq ($(ARCH), power) GEMM_GEMV_FORWARD = 1 -GEMM_GEMV_FORWARD_BF16 = 1 endif ifeq ($(SMALL_MATRIX_OPT), 1) diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c index db64915e05..fa7df858f8 100644 --- a/kernel/power/sbgemv_n.c +++ b/kernel/power/sbgemv_n.c @@ -31,8 +31,10 @@ static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vecto { if (beta == 0) { memset(output_vector, 0, sizeof(FLOAT) * n); - } else if ((output_vector != input_vector) && (beta == 1)) { - memcpy(output_vector, input_vector, sizeof(FLOAT) * n); + } else if (beta == 1) { + if (output_vector != input_vector) { + memcpy(output_vector, input_vector, sizeof(FLOAT) * n); + } } else { vec_f32 b = { beta, beta, beta, beta }; diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index a86c73d1c5..05d9b33aba 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -205,15 +205,14 @@ main (int argc, char *argv[]) for (l = 0; l < 2; l++) { // l = 1 to test inc_x & inc_y not equal to one. for (x = 1; x <= loop; x++) { - m = l + 1; - k = (x == 0) ? 0 : m; + k = (x == 0) ? 
0 : l + 1; float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); - float *B = (float *)malloc_safe(x * sizeof(FLOAT) * m); - float *C = (float *)malloc_safe(x * sizeof(FLOAT) * m); + float *B = (float *)malloc_safe(x * sizeof(FLOAT) << l); + float *C = (float *)malloc_safe(x * sizeof(FLOAT) << l); bfloat16_bits *AA = (bfloat16_bits *)malloc_safe(x * x * sizeof(bfloat16_bits)); - bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(x * sizeof(bfloat16_bits) * m); + bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(x * sizeof(bfloat16_bits) << l); float *DD = (float *)malloc_safe(x * sizeof(FLOAT)); - float *CC = (float *)malloc_safe(x * sizeof(FLOAT) * m); + float *CC = (float *)malloc_safe(x * sizeof(FLOAT) << l); if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || (DD == NULL) || (CC == NULL)) return 1; @@ -228,9 +227,9 @@ main (int argc, char *argv[]) sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one); AA[j * x + i].v = atmp; } - B[j*m] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - sbstobf16_(&one, &B[j*m], &one, &btmp, &one); - BB[j*m].v = btmp; + B[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j << l], &one, &btmp, &one); + BB[j << l].v = btmp; } for (y = 0; y < 2; y++) { @@ -240,9 +239,9 @@ main (int argc, char *argv[]) transA = 'T'; } - memset(CC, 0, x * m * sizeof(FLOAT)); + memset(CC, 0, x * sizeof(FLOAT) << l); memset(DD, 0, x * sizeof(FLOAT)); - memset(C, 0, x * m * sizeof(FLOAT)); + memset(C, 0, x * sizeof(FLOAT) << l); SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); @@ -250,15 +249,15 @@ main (int argc, char *argv[]) for (j = 0; j < x; j++) for (i = 0; i < x; i++) if (transA == 'N') { - DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j*m]); + DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j << l]); } else if (transA == 'T') { - DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i*m]); + DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i << l]); } for (j = 0; j < x; j++) { - if (fabs (CC[j*m] - C[j*m]) > 1.0) + if (fabs (CC[j << l] - C[j << l]) > 1.0) ret++; - if (fabs (CC[j*m] - DD[j]) > 1.0) + if (fabs (CC[j << l] - DD[j]) > 1.0) ret++; } } From 2f142ee857e2c04118401a83f62bcba365a8f537 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Mon, 9 Sep 2024 14:41:55 -0500 Subject: [PATCH 040/244] More common code. 
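The duplication being hoisted is the paired full-vector-plus-partial-tail load (and store) that every N-kernel spelled out inline; the new vec_loadN2_f32/vec_storeN2_f32 helpers collapse each such pair into one call. A simplified scalar model of the load half follows, using a plain struct instead of a VSX vector, so this illustrates the shape of the helper rather than reproducing the kernel code:

```c
#include <stddef.h>
#include <string.h>

typedef struct { float v[4]; } v4f; /* stand-in for vec_f32 */

/* Partial load of n (1..4) floats, remaining lanes zeroed
 * (models vec_loadN_f32). */
static v4f loadN_f32(const float *src, size_t n)
{
    v4f r = { { 0.0f, 0.0f, 0.0f, 0.0f } };
    memcpy(r.v, src, n * sizeof(float));
    return r;
}

/* One full vector plus an n-element tail: the two-statement idiom the
 * kernels used to repeat at every tail (models vec_loadN2_f32). */
static void loadN2_f32(v4f dst[2], const float *src, size_t n)
{
    memcpy(dst[0].v, src, sizeof(v4f)); /* full first vector */
    dst[1] = loadN_f32(src + 4, n);     /* n remaining lanes */
}
```

With the helpers in place, each tail handler in sbgemv_n_vsx.c shrinks from two statements to one, as the hunks below show.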
--- kernel/power/sbgemv_common.c | 12 ++++++++++++ kernel/power/sbgemv_n.c | 6 ++---- kernel/power/sbgemv_n_vsx.c | 24 ++++++++---------------- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index b11ab59de8..1893eba516 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -138,6 +138,12 @@ FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n) return (vec_f32)vec_loadN(src, n * (sizeof(FLOAT) / sizeof(IFLOAT))); } +FORCEINLINE void vec_loadN2_f32(vec_f32 *data, vec_f32 *src, BLASLONG n) +{ + data[0] = src[0]; + data[1] = vec_loadN_f32(&src[1], n); +} + FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n) { FLOAT *dst2 = (FLOAT *)(dst); @@ -160,6 +166,12 @@ FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n) #endif } +FORCEINLINE void vec_storeN2_f32(vec_f32 *data, vec_f32 *dst, BLASLONG n) +{ + dst[0] = data[0]; + vec_storeN_f32(data[1], &dst[1], n); +} + FORCEINLINE vec_f32 vec_mult(vec_f32 *inp, vec_bf16 in0, vec_bf16 zero) { vec_f32 v_in00 = BF16_HI(in0, zero); diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c index fa7df858f8..05c02a0068 100644 --- a/kernel/power/sbgemv_n.c +++ b/kernel/power/sbgemv_n.c @@ -75,12 +75,10 @@ static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vecto n &= 7; if (n > 4) { BLASLONG n3 = n & 3; - v_inp0[0] = in[(i * 2) + 0]; - v_inp0[1] = vec_loadN_f32(&in[(i * 2) + 1], n3); + vec_loadN2_f32(v_inp0, &in[(i * 2) + 0], n3); v_inp0[0] *= b; v_inp0[1] *= b; - out[(i * 2) + 0] = v_inp0[0]; - vec_storeN_f32(v_inp0[1], &out[(i * 2) + 1], n3); + vec_storeN2_f32(v_inp0, &out[(i * 2) + 0], n3); } else if (n) { v_inp0[0] = vec_loadN_f32(&in[(i * 2) + 0], n); v_inp0[0] *= b; diff --git a/kernel/power/sbgemv_n_vsx.c b/kernel/power/sbgemv_n_vsx.c index ddbf908b3f..45570950ea 100644 --- a/kernel/power/sbgemv_n_vsx.c +++ b/kernel/power/sbgemv_n_vsx.c @@ -64,13 +64,11 @@ static void BF16GEMV_N_VSX_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA n &= 7; if (n > 4) { BLASLONG n3 = n & 3; - vy0[0] = v_y[(i * 2) + 0]; - vy0[1] = vec_loadN_f32(&v_y[(i * 2) + 1], n3); + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); - v_y[(i * 2) + 0] = vy0[0]; - vec_storeN_f32(vy0[1], &v_y[(i * 2) + 1], n3); + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); @@ -116,14 +114,12 @@ static void BF16GEMV_N_VSX_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA n &= 7; if (n > 4) { BLASLONG n3 = n & 3; - vy0[0] = v_y[(i * 2) + 0]; - vy0[1] = vec_loadN_f32(&v_y[(i * 2) + 1], n3); + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); - v_y[(i * 2) + 0] = vy0[0]; - vec_storeN_f32(vy0[1], &v_y[(i * 2) + 1], n3); + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); @@ -178,16 +174,14 @@ static void BF16GEMV_N_VSX_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA n &= 7; if (n > 4) { BLASLONG n3 = n & 3; - vy0[0] = v_y[(i * 2) + 0]; - vy0[1] = vec_loadN_f32(&v_y[(i * 2) + 1], n3); + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); vec_loadN_mult2(v_x2, &va2[i], n, zero, vy0); vec_loadN_mult2(v_x3, &va3[i], n, zero, vy0); - v_y[(i * 2) + 0] = vy0[0]; - vec_storeN_f32(vy0[1], &v_y[(i * 2) + 1], 
n3); + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); @@ -263,8 +257,7 @@ static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS n &= 7; if (n > 4) { BLASLONG n3 = n & 3; - vy0[0] = v_y[(i * 2) + 0]; - vy0[1] = vec_loadN_f32(&v_y[(i * 2) + 1], n3); + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); @@ -275,8 +268,7 @@ static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS vec_loadN_mult2(v_x6, &vb2[i], n, zero, vy0); vec_loadN_mult2(v_x7, &vb3[i], n, zero, vy0); - v_y[(i * 2) + 0] = vy0[0]; - vec_storeN_f32(vy0[1], &v_y[(i * 2) + 1], n3); + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); From 72216d28c256087363435642b6ec3d497902033d Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 11 Sep 2024 08:47:32 -0500 Subject: [PATCH 041/244] Fix bug with inc_y adding results twice. --- kernel/power/sbgemv_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index 1893eba516..07f75d3183 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -300,7 +300,7 @@ FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, F FORCEINLINE void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { for (BLASLONG i = 0; i < n; i++) { - *dest += *src++; + *dest = *src++; dest += inc_dest; } } From 869a169c57d66e783f2a9fc18e34ffa8db51af78 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 11 Sep 2024 22:18:14 +0200 Subject: [PATCH 042/244] Fix ZAXPYTEST prototype --- ctest/c_zblat1c.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c index f7c0515fc5..5629f8ca7d 100644 --- a/ctest/c_zblat1c.c +++ b/ctest/c_zblat1c.c @@ -380,7 +380,7 @@ static doublereal c_b43 = 1.; static integer i__; extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*); static doublecomplex mwpcs[5], mwpct[5]; - extern /* Subroutine */ int zscaltest_(integer*, doublereal*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*); + extern /* Subroutine */ int zscaltest_(integer*, doublecomplex*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*); static doublecomplex cx[8]; extern doublereal dznrm2test_(integer*, doublecomplex*, integer*); static integer np1; @@ -595,7 +595,7 @@ static doublereal c_b43 = 1.; static integer ki; extern /* Subroutine */ int zdotutest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zswaptest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*); static integer kn; - extern /* Subroutine */ int zaxpytest_(integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*); + extern /* Subroutine */ int zaxpytest_(integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*); static doublecomplex cx[7], cy[7]; static integer mx, my; From 383e0b133e577e882dce84f7eeb1c8118ba533d5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 11 Sep 2024 22:21:09 +0200 Subject: [PATCH 043/244] remove suppression of gcc14's incompatible pointer error --- Makefile.system | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/Makefile.system b/Makefile.system index b065f9a981..c40c1f2340 100644 --- a/Makefile.system +++ b/Makefile.system @@ -376,9 +376,6 @@ OBJCONV = $(CROSS_SUFFIX)objconv ifeq ($(NOFORTRAN), 1) C_LAPACK = 1 override FEXTRALIB = -ifeq ($(C_COMPILER), GCC) -CCOMMON_OPT += -Wno-error=incompatible-pointer-types -endif endif ifeq ($(C_COMPILER), GCC) From 4178905fa77333f9212c303b4ab19c36699f07b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Sep 2024 16:39:20 +0200 Subject: [PATCH 044/244] Update version of upload-artifacts following deprecation --- .github/workflows/nightly-Homebrew-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index ca57fba709..71da7cd875 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -69,7 +69,7 @@ jobs: mv *.bottle.tar.gz bottles - name: Upload bottle - uses: actions/upload-artifact@v1 + uses: actions/upload-artifact@v3 with: name: openblas--HEAD.catalina.bottle.tar.gz path: bottles From b588e922a1571e8de606d8a697ed0ddf48fb5223 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 12 Sep 2024 18:13:46 +0200 Subject: [PATCH 045/244] Update oneAPI download location for Mac to final --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index d72baabe16..8c5b1e5bb2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -212,7 +212,7 @@ jobs: vmImage: 'macOS-latest' variables: LD_LIBRARY_PATH: /usr/local/opt/llvm/lib - MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg + MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/edb4dc2f-266f-47f2-8d56-21bc7764e119/m_HPCKit_p_2023.2.0.49443.dmg LIBRARY_PATH: /usr/local/opt/llvm/lib MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler steps: From 7947970f9d5d88a9399c691a0911689c592f5d37 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 13 Sep 2024 06:22:13 -0500 Subject: [PATCH 046/244] Move common code. 
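This moves the generic vector load/store helpers (vec_load_pair, vec_loadN, vec_storeN_f32 and friends) from sbgemv_common.c into a new gemm_common.c with its own GEMM_COMMON_C guard, so that other POWER kernels can share them, and adds a generic vec_storeN for IFLOAT data. The pre-POWER9 fallback in vec_loadN/vec_storeN handles a partial vector of n (< 8) 16-bit elements with at most one 8-byte, one 4-byte and one 2-byte transfer, selected by the bits of n; on POWER9 and newer the same job is a single vec_xl_len/vec_xst_len. A standalone sketch of that decomposition (copyN_u16_ref is an invented name; the real code stages through a 16-byte aligned buffer and then does one full vector load or store):

    #include <stdint.h>
    #include <string.h>

    /* copyN_u16_ref: copy n (0..7) 16-bit elements using power-of-two
       sized chunks, mirroring the n & 4 / n & 2 / n & 1 logic of vec_loadN. */
    static void copyN_u16_ref(uint16_t *dst, const uint16_t *src, int n)
    {
        if (n & 4)                         /* 8-byte chunk: elements 0..3  */
            memcpy(dst, src, sizeof(uint64_t));
        if (n & 2) {                       /* 4-byte chunk at offset n & 4 */
            int n4 = n & 4;
            memcpy(dst + n4, src + n4, sizeof(uint32_t));
        }
        if (n & 1) {                       /* last element at offset n & 6 */
            int n6 = n & 6;
            dst[n6] = src[n6];
        }
    }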
--- kernel/power/gemm_common.c | 148 +++++++++++++++++++++++++++++++++++ kernel/power/sbgemv_common.c | 133 +------------------------------ kernel/power/sbgemv_n.c | 2 +- kernel/power/sbgemv_n_vsx.c | 3 +- 4 files changed, 152 insertions(+), 134 deletions(-) create mode 100644 kernel/power/gemm_common.c diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c new file mode 100644 index 0000000000..c33faffe0e --- /dev/null +++ b/kernel/power/gemm_common.c @@ -0,0 +1,148 @@ +#ifndef GEMM_COMMON_C +#define GEMM_COMMON_C +#include "common.h" + +#include + +#define FORCEINLINE inline __attribute__((always_inline)) + +#ifdef __clang__ +#define uint16_t unsigned short +#define uint32_t unsigned int +#define uint64_t unsigned long long +#endif + +#ifdef _ARCH_PWR10 +#ifdef __has_builtin +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif +#endif + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v1, v0) +#else +#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v0, v1) +#endif + +#define USE_VECTOR_PAIRS +#endif + +typedef __vector IFLOAT vec_bf16; +typedef __vector FLOAT vec_f32; +typedef __vector unsigned char vec_uc8; + +FORCEINLINE vec_uc8 vec_load_vec(void *src) +{ + return vec_xl(0, (unsigned char *)(src)); +} + +FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src) +{ +#ifdef USE_VECTOR_PAIRS + __vector_pair vy0p; + vy0p = *(__vector_pair *)(src); + __builtin_vsx_disassemble_pair((void *)(dst), &vy0p); +#else + dst[0] = src[0]; + dst[1] = src[1]; +#endif +} + +FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src) +{ +#ifdef USE_VECTOR_PAIRS + __vector_pair vy0p; + __builtin_vsx_assemble_pair2(&vy0p, (vec_uc8)src[1], (vec_uc8)src[0]); + *(__vector_pair *)(dst) = vy0p; +#else + dst[0] = src[0]; + dst[1] = src[1]; +#endif +} + +FORCEINLINE vec_bf16 vec_loadN(void *src, BLASLONG n) +{ + IFLOAT *src2 = (IFLOAT *)(src); +#ifdef _ARCH_PWR9 + return vec_xl_len(src2, n * sizeof(IFLOAT)); +#else + __attribute__((aligned(16))) IFLOAT data[sizeof(vec_bf16) / sizeof(IFLOAT)]; + memset(data, 0, sizeof(vec_bf16)); + if (n & 4) { + memcpy(data, src2, sizeof(uint64_t)); + } + if (n & 2) { + BLASLONG n4 = n & 4; + memcpy(data + n4, src2 + n4, sizeof(uint32_t)); + } + if (n & 1) { + BLASLONG n6 = n & 6; + data[n6] = src2[n6]; + } + return (vec_bf16)vec_load_vec(data); +#endif +} + +FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n) +{ +#ifndef _ARCH_PWR9 + if (n & 4) { + return (vec_f32)vec_load_vec(src); + } +#endif + return (vec_f32)vec_loadN(src, n * (sizeof(FLOAT) / sizeof(IFLOAT))); +} + +FORCEINLINE void vec_loadN2_f32(vec_f32 *data, vec_f32 *src, BLASLONG n) +{ + data[0] = src[0]; + data[1] = vec_loadN_f32(&src[1], n); +} + +FORCEINLINE void vec_storeN(vec_bf16 data, void *dst, BLASLONG n) +{ + IFLOAT *dst2 = (IFLOAT *)(dst); +#ifdef _ARCH_PWR9 + vec_xst_len(data, dst2, n * sizeof(IFLOAT)); +#else + if (n & 8) { + vec_xst(data, 0, dst2); + return; + } + __attribute__((aligned(16))) IFLOAT data2[sizeof(vec_f32) / sizeof(IFLOAT)]; + vec_xst(data, 0, data2); + if (n & 4) { + memcpy(dst2, data2, sizeof(uint64_t)); + } + if (n & 2) { + BLASLONG n4 = n & 4; + memcpy(dst2 + n4, data2 + n4, sizeof(uint32_t)); + } + if (n & 1) { + BLASLONG n6 = n & 6; + dst2[n6] = data2[n6]; + 
} +#endif +} + +FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n) +{ +#ifndef _ARCH_PWR9 + if (n & 4) { + vec_xst(data, 0, (FLOAT *)dst); + return; + } +#endif + return vec_storeN((vec_bf16)data, dst, n * (sizeof(FLOAT) / sizeof(IFLOAT))); +} + +FORCEINLINE void vec_storeN2_f32(vec_f32 *data, vec_f32 *dst, BLASLONG n) +{ + dst[0] = data[0]; + vec_storeN_f32(data[1], &dst[1], n); +} +#endif diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index 07f75d3183..46dee74c3e 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -27,40 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef SBGEMV_COMMON_C #define SBGEMV_COMMON_C -#include "common.h" - -#include - -#define FORCEINLINE inline __attribute__((always_inline)) - -#ifdef __clang__ -#define uint16_t unsigned short -#define uint32_t unsigned int -#define uint64_t unsigned long long -#endif - -#ifdef _ARCH_PWR10 -#ifdef __has_builtin -#if !__has_builtin(__builtin_vsx_assemble_pair) -#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair -#endif -#if !__has_builtin(__builtin_vsx_disassemble_pair) -#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair -#endif -#endif - -#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v1, v0) -#else -#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v0, v1) -#endif - -#define USE_VECTOR_PAIRS -#endif - -typedef __vector IFLOAT vec_bf16; -typedef __vector FLOAT vec_f32; -typedef __vector unsigned char vec_uc8; +#include "gemm_common.c" #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define BF16_HI(data, zero) (vec_f32)vec_mergeh(data, zero) @@ -70,108 +37,12 @@ typedef __vector unsigned char vec_uc8; #define BF16_LO(data, zero) (vec_f32)vec_mergel(zero, data) #endif -FORCEINLINE vec_uc8 vec_load_vec(void *src) -{ - return vec_xl(0, (unsigned char *)(src)); -} - -FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src) -{ -#ifdef USE_VECTOR_PAIRS - __vector_pair vy0p; - vy0p = *(__vector_pair *)(src); - __builtin_vsx_disassemble_pair((void *)(dst), &vy0p); -#else - dst[0] = src[0]; - dst[1] = src[1]; -#endif -} - -FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src) -{ -#ifdef USE_VECTOR_PAIRS - __vector_pair vy0p; - __builtin_vsx_assemble_pair2(&vy0p, (vec_uc8)src[1], (vec_uc8)src[0]); - *(__vector_pair *)(dst) = vy0p; -#else - dst[0] = src[0]; - dst[1] = src[1]; -#endif -} - -FORCEINLINE vec_bf16 vec_loadN(void *src, BLASLONG n) -{ - IFLOAT *src2 = (IFLOAT *)(src); -#ifdef _ARCH_PWR9 - return vec_xl_len(src2, n * sizeof(IFLOAT)); -#else - __attribute__((aligned(16))) IFLOAT data[sizeof(vec_bf16) / sizeof(IFLOAT)]; - memset(data, 0, sizeof(vec_bf16)); - if (n & 4) { - memcpy(data, src2, sizeof(uint64_t)); - } - if (n & 2) { - BLASLONG n4 = n & 4; - memcpy(data + n4, src2 + n4, sizeof(uint32_t)); - } - if (n & 1) { - BLASLONG n6 = n & 6; - data[n6] = src2[n6]; - } - return (vec_bf16)vec_load_vec(data); -#endif -} - FORCEINLINE vec_f32 vec_loadNHi(void *src, BLASLONG n, vec_bf16 zero) { vec_bf16 data = vec_loadN(src, n); return BF16_HI(data, zero); } -FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n) -{ -#ifndef _ARCH_PWR9 - if (n & 4) { - return (vec_f32)vec_load_vec(src); - } -#endif - return (vec_f32)vec_loadN(src, n * (sizeof(FLOAT) / sizeof(IFLOAT))); -} - -FORCEINLINE void vec_loadN2_f32(vec_f32 *data, vec_f32 *src, BLASLONG n) -{ - data[0] = src[0]; - data[1] 
= vec_loadN_f32(&src[1], n); -} - -FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n) -{ - FLOAT *dst2 = (FLOAT *)(dst); -#ifdef _ARCH_PWR9 - vec_xst_len(data, dst2, n * sizeof(FLOAT)); -#else - if (n & 4) { - vec_xst(data, 0, dst2); - return; - } - __attribute__((aligned(16))) FLOAT data2[sizeof(vec_f32) / sizeof(FLOAT)]; - vec_xst(data, 0, data2); - if (n & 2) { - memcpy(dst2, data2, sizeof(uint64_t)); - } - if (n & 1) { - BLASLONG n2 = n & 2; - dst2[n2] = data2[n2]; - } -#endif -} - -FORCEINLINE void vec_storeN2_f32(vec_f32 *data, vec_f32 *dst, BLASLONG n) -{ - dst[0] = data[0]; - vec_storeN_f32(data[1], &dst[1], n); -} - FORCEINLINE vec_f32 vec_mult(vec_f32 *inp, vec_bf16 in0, vec_bf16 zero) { vec_f32 v_in00 = BF16_HI(in0, zero); @@ -297,7 +168,7 @@ FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, F } } -FORCEINLINE void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +FORCEINLINE void move_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { for (BLASLONG i = 0; i < n; i++) { *dest = *src++; dest += inc_dest; } diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c index 05c02a0068..c7559a47c4 100644 --- a/kernel/power/sbgemv_n.c +++ b/kernel/power/sbgemv_n.c @@ -179,7 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * a += NB; if (inc_y != 1) { - add_y(NB, ybuffer, y_ptr, inc_y); + move_y(NB, ybuffer, y_ptr, inc_y); y_ptr += (NB * inc_y); } else { y_ptr += NB; } diff --git a/kernel/power/sbgemv_n_vsx.c b/kernel/power/sbgemv_n_vsx.c index 45570950ea..cab2316d4e 100644 --- a/kernel/power/sbgemv_n_vsx.c +++ b/kernel/power/sbgemv_n_vsx.c @@ -269,8 +269,7 @@ static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS vec_loadN_mult2(v_x7, &vb3[i], n, zero, vy0); vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); - } else - if (n) { + } else if (n) { vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); From 969bb949b175032388011c6f8401269991cbe2bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Sep 2024 11:10:28 +0200 Subject: [PATCH 047/244] Strip any mtune option from FFLAGS if the compiler is flang-new --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index c40c1f2340..16b367503c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1720,8 +1720,8 @@ LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) endif ifeq ($(F_COMPILER),FLANGNEW) -LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) -override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) +LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% ,$(FFLAGS)) +override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% ,$(FFLAGS)) endif LAPACK_CFLAGS = $(CFLAGS) From 89a12fa08352da73b03225cf1743dead75669043 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Mon, 23 Sep 2024 06:32:14 -0500 Subject: [PATCH 048/244] MMA BF16 GEMV code.
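The POWER10 path feeds the GEMV inner loops through the MMA bf16 outer-product builtins: __builtin_mma_xvbf16ger2pp multiplies a pair of bf16 elements per lane and accumulates 4x4 f32 partial products in a __vector_quad, and __builtin_mma_disassemble_acc spills the accumulator into four f32 vectors afterwards. When the two operands hold the same 8-element slice of a matrix row and of x, the row dot product lands on the accumulator diagonal, which is why the T-kernels below reduce with temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3]. A minimal sketch of that pattern, assuming GCC or Clang with MMA support (-mcpu=power10); bf16_dot8_ref is an invented name, and alpha/beta scaling plus tail handling are omitted:

    #include <altivec.h>

    typedef __vector unsigned char vuc;

    /* bf16_dot8_ref: dot product of the 8 bf16 pairs held in a and x. */
    static float bf16_dot8_ref(vuc a, vuc x)
    {
        __vector_quad acc;
        __vector float t[4];

        __builtin_mma_xxsetaccz(&acc);            /* acc  = 0             */
        __builtin_mma_xvbf16ger2pp(&acc, a, x);   /* acc += outer(a, x)   */
        __builtin_mma_disassemble_acc((void *)t, &acc);

        return t[0][0] + t[1][1] + t[2][2] + t[3][3]; /* diagonal = dot */
    }

The N-direction kernels use the same accumulate but first interleave the matrix columns (vec_mergeh/vec_mergel) and splat x across lanes, so one accumulator yields results for a block of y instead of a single dot product.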
--- kernel/power/gemm_common.c | 2 + kernel/power/sbgemv_common.c | 2 +- kernel/power/sbgemv_common_power10.c | 265 +++++++++++++++++++++ kernel/power/sbgemv_n.c | 22 ++ kernel/power/sbgemv_n_power10.c | 306 ++++++++++++++++++++++++- kernel/power/sbgemv_n_vsx.c | 71 +++--- kernel/power/sbgemv_t.c | 15 ++ kernel/power/sbgemv_t_power10.c | 330 ++++++++++++++++++++++++++- kernel/power/sbgemv_t_vsx.c | 67 +++--- 9 files changed, 1011 insertions(+), 69 deletions(-) create mode 100644 kernel/power/sbgemv_common_power10.c diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c index c33faffe0e..0611ebc2a9 100644 --- a/kernel/power/gemm_common.c +++ b/kernel/power/gemm_common.c @@ -4,6 +4,8 @@ #include +#define NBMAX 4096 + #define FORCEINLINE inline __attribute__((always_inline)) #ifdef __clang__ diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index 46dee74c3e..ab50f430af 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -111,7 +111,7 @@ FORCEINLINE vec_f32 vec_loadNHi_mult(vec_bf16 *in, vec_f32 v_inp0, BLASLONG n, v return (v_inp0 * v_in00); } -FORCEINLINE vec_f32 vec_loadNHi_multi2(vec_f32 v_x0, vec_bf16 *in, BLASLONG n, vec_bf16 zero) +FORCEINLINE vec_f32 vec_loadNHi_mult2(vec_f32 v_x0, vec_bf16 *in, BLASLONG n, vec_bf16 zero) { vec_f32 v_in00 = vec_loadNHi(in, n, zero); diff --git a/kernel/power/sbgemv_common_power10.c b/kernel/power/sbgemv_common_power10.c new file mode 100644 index 0000000000..da088014b0 --- /dev/null +++ b/kernel/power/sbgemv_common_power10.c @@ -0,0 +1,265 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#ifndef SBGEMV_COMMON_MMA_C +#define SBGEMV_COMMON_MMA_C +#include "sbgemv_common.c" + +FORCEINLINE void vec_load_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(in); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0, (vec_uc8)inp); +} + +FORCEINLINE void vec_load_mult2_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_bf16 in0[2]; + + vec_load_pair((vec_f32 *)in0, (vec_f32 *)in); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[1], (vec_uc8)inp[1]); +} + +FORCEINLINE void vec_loadN_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(in, n); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0, (vec_uc8)inp); +} + +FORCEINLINE void vec_mult1_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) +{ + vec_bf16 in00 = vec_mergeh(in0, in0); + + __builtin_mma_xvbf16ger2(out, (vec_uc8)inp, (vec_uc8)in00); +} + +FORCEINLINE void vec_mult2_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) +{ + vec_bf16 in01 = vec_mergel(in0, in0); + + vec_mult1_mma(&out[0], in0, inp); + + __builtin_mma_xvbf16ger2(&out[1], (vec_uc8)inp, (vec_uc8)in01); +} + +FORCEINLINE void vec_mult4_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 inp) +{ + vec_mult2_mma(out + 0, in0[0], inp); + vec_mult2_mma(out + 2, in0[1], inp); +} + +FORCEINLINE void vec_loadN_mult11_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(in, n); + + vec_mult1_mma(out, in0, inp); +} + +FORCEINLINE void vec_loadN_mult12_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(in, n); + + vec_mult2_mma(out, in0, inp); +} + +FORCEINLINE void vec_load_mult12_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(in); + + vec_mult2_mma(out, in0, inp); +} + +FORCEINLINE void vec_load_mult18_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) +{ + vec_bf16 in0[4]; + + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(in + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(in + 2)); + + vec_mult4_mma(&out[0], in0 + 0, inp); + vec_mult4_mma(&out[4], in0 + 2, inp); +} + +FORCEINLINE void vec_reduce1_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + __builtin_mma_disassemble_acc((void*)temp, &out[0]); + + vy0[0] += (temp[0] * v_alpha); +} + +FORCEINLINE void vec_reduce2_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + vec_reduce1_mma(&out[0], &temp[0], v_alpha, &vy0[0]); + vec_reduce1_mma(&out[1], &temp[4], v_alpha, &vy0[1]); +} + +FORCEINLINE void vec_reduce8_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + vec_reduce2_mma(&out[0], &temp[0], v_alpha, vy0 + 0); + vec_reduce2_mma(&out[2], &temp[8], v_alpha, vy0 + 2); + vec_reduce2_mma(&out[4], &temp[16], v_alpha, vy0 + 4); + vec_reduce2_mma(&out[6], &temp[24], v_alpha, vy0 + 6); +} + +FORCEINLINE void vec_mult11a_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) +{ + vec_bf16 in00 = vec_mergeh(in0, in1); + + __builtin_mma_xvbf16ger2(out, (vec_uc8)inp, (vec_uc8)in00); +} + +FORCEINLINE void vec_mult2a_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) +{ + vec_bf16 in01 = vec_mergel(in0, in1); + + vec_mult11a_mma(&out[0], in0, in1, inp); + + __builtin_mma_xvbf16ger2(&out[1], (vec_uc8)inp, (vec_uc8)in01); +} + +FORCEINLINE void 
vec_mult4a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp) +{ + vec_mult2a_mma(out + 0, in0[0], in1[0], inp); + vec_mult2a_mma(out + 2, in0[1], in1[1], inp); +} + +FORCEINLINE void vec_loadN_mult11a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(ina, n); + vec_bf16 in1 = vec_loadN(inb, n); + + vec_mult11a_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load_mult22a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(ina); + vec_bf16 in1 = (vec_bf16)vec_load_vec(inb); + + vec_mult2a_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load_mult28a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(ina + 0)); + vec_load_pair((vec_f32 *)(in1 + 0), (vec_f32 *)(inb + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(ina + 2)); + vec_load_pair((vec_f32 *)(in1 + 2), (vec_f32 *)(inb + 2)); + + vec_mult4a_mma(&out[0], in0 + 0, in1 + 0, inp); + vec_mult4a_mma(&out[4], in0 + 2, in1 + 2, inp); +} + +FORCEINLINE void vec_loadN_mult22a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(ina, n); + vec_bf16 in1 = vec_loadN(inb, n); + + vec_mult2a_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_mult11b_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) +{ + vec_bf16 in00 = vec_mergeh(in0, in1); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)inp, (vec_uc8)in00); +} + +FORCEINLINE void vec_mult2b_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) +{ + vec_bf16 in01 = vec_mergel(in0, in1); + + vec_mult11b_mma(&out[0], in0, in1, inp); + + __builtin_mma_xvbf16ger2pp(&out[1], (vec_uc8)inp, (vec_uc8)in01); +} + +FORCEINLINE void vec_mult4b_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp) +{ + vec_mult2b_mma(out + 0, in0[0], in1[0], inp); + vec_mult2b_mma(out + 2, in0[1], in1[1], inp); +} + +FORCEINLINE void vec_loadN_mult11b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(ina, n); + vec_bf16 in1 = vec_loadN(inb, n); + + vec_mult11b_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load_mult22b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(ina); + vec_bf16 in1 = (vec_bf16)vec_load_vec(inb); + + vec_mult2b_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load_mult28b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(ina + 0)); + vec_load_pair((vec_f32 *)(in1 + 0), (vec_f32 *)(inb + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(ina + 2)); + vec_load_pair((vec_f32 *)(in1 + 2), (vec_f32 *)(inb + 2)); + + vec_mult4b_mma(&out[0], in0 + 0, in1 + 0, inp); + vec_mult4b_mma(&out[4], in0 + 2, in1 + 2, inp); +} + +FORCEINLINE void vec_loadN_mult22b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(ina, n); + vec_bf16 in1 = vec_loadN(inb, n); + + vec_mult2b_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load4_pair(vec_f32 *vy0, vec_f32 *v_y) +{ + vec_load_pair(vy0 + 0, v_y + 0); + vec_load_pair(vy0 + 2, v_y + 2); + vec_load_pair(vy0 + 4, v_y + 4); + vec_load_pair(vy0 + 6, v_y + 6); +} + +FORCEINLINE void vec_store4_pair(vec_f32 *v_y, vec_f32 *vy0) +{ + vec_store_pair(v_y + 
0, vy0 + 0); + vec_store_pair(v_y + 2, vy0 + 2); + vec_store_pair(v_y + 4, vy0 + 4); + vec_store_pair(v_y + 6, vy0 + 6); +} + +#endif diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c index c7559a47c4..4768be31fa 100644 --- a/kernel/power/sbgemv_n.c +++ b/kernel/power/sbgemv_n.c @@ -87,6 +87,10 @@ static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vecto } } +#if (defined(_ARCH_PWR10) && (defined(USE_BFGEMV_8_N_MMA) || (!defined(USE_BFGEMV_N_MMA) && defined(USE_BFGEMV_8_N_VSX)))) || (!defined(_ARCH_PWR10) && defined(USE_BFGEMV_8_N_VSX)) +#define USE_N_8 +#endif + int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { IFLOAT *x_ptr, *ap[4]; @@ -100,7 +104,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * y_ptr = y; BLASLONG lda4 = lda << 2; +#ifdef USE_N_8 BLASLONG lda8 = lda << 3; +#endif BLASLONG NB = NBMAX; BLASLONG m2 = (m & (NBMAX - 1)); @@ -126,6 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * ap[3] = ap[2] + lda; if (inc_x == 1) { +#ifdef USE_N_8 for (BLASLONG j = 0; j + 8 <= n; j += 8) { BF16GEMV_N_8(NB, ap, x_ptr, ybuffer, lda4, alpha); ap[0] += lda8; @@ -135,9 +142,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * x_ptr += 8; } if (n & 4) { +#else + for (BLASLONG j = 0; j + 4 <= n; j += 4) { +#endif BF16GEMV_N_4(NB, ap, x_ptr, ybuffer, alpha); ap[0] += lda4; ap[1] += lda4; +#ifndef USE_N_8 + ap[2] += lda4; + ap[3] += lda4; +#endif x_ptr += 4; } if (n & 2) { @@ -149,6 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * BF16GEMV_N_1(NB, ap, x_ptr, ybuffer, alpha); } } else { +#ifdef USE_N_8 for (BLASLONG j = 0; j + 8 <= n; j += 8) { copy_x(8, x_ptr, xbuffer, inc_x); BF16GEMV_N_8(NB, ap, xbuffer, ybuffer, lda4, alpha); @@ -159,10 +174,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * x_ptr += 8 * inc_x; } if (n & 4) { +#else + for (BLASLONG j = 0; j + 4 <= n; j += 4) { +#endif copy_x(4, x_ptr, xbuffer, inc_x); BF16GEMV_N_4(NB, ap, xbuffer, ybuffer, alpha); ap[0] += lda4; ap[1] += lda4; +#ifndef USE_N_8 + ap[2] += lda4; + ap[3] += lda4; +#endif x_ptr += 4 * inc_x; } if (n & 2) { diff --git a/kernel/power/sbgemv_n_power10.c b/kernel/power/sbgemv_n_power10.c index fc83b38c37..7b2beb0c7b 100644 --- a/kernel/power/sbgemv_n_power10.c +++ b/kernel/power/sbgemv_n_power10.c @@ -25,9 +25,309 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -//#include "sbgemv_common.c" +#ifndef SBGEMV_N_MMA_C +#define SBGEMV_N_MMA_C -#include "sbgemv_n_vsx.c" +#define USE_BFGEMV_N_MMA + +#ifdef USE_BFGEMV_N_MMA +#include "sbgemv_common_power10.c" + +#ifndef BF16GEMV_N_X +#define BF16GEMV_N_X +#define BF16GEMV_N_8 BF16GEMV_N_MMA_8 +#define BF16GEMV_N_4 BF16GEMV_N_MMA_4 +#define BF16GEMV_N_2 BF16GEMV_N_MMA_2 +#define BF16GEMV_N_1 BF16GEMV_N_MMA_1 +#endif + +#define USE_BFGEMV_8_N_MMA + +static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0; + __vector_quad temp[2*4]; + vec_f32 temp0[8*4], vy0[2*4]; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_bf16 v_x0 = vec_loadN(x_bf, 1); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 4 <= n8; i += 4) { + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult18_mma(&temp[0], &va0[i + 0], v_x0); + + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + } + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult12_mma(&temp[0], &va0[i], v_x0); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + + vec_loadN_mult12_mma(&temp[0], &va0[i], v_x0, n); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + } else if (n) { + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vec_loadN_mult11_mma(&temp[0], &va0[i], v_x0, n); + + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} + +static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1; + __vector_quad temp[2*4]; + vec_f32 temp0[8*4], vy0[2*4]; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_bf16 v_x0 = vec_loadN(x_bf, 2); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 4 <= n8; i += 4) { + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0); + + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + } + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0, n); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + } else if (n) { + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0, n); + + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} -//#include "sbgemv_n.c" +static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3; + __vector_quad temp[2*4]; + vec_f32 temp0[8*4], vy0[2*4]; + vec_f32 v_alpha = { alpha, alpha, 
alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + vec_bf16 *va2 = (vec_bf16 *)a2; + vec_bf16 *va3 = (vec_bf16 *)a3; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_bf16 v_x00 = vec_loadN(x_bf, 4); + + vec_bf16 v_x01 = (vec_bf16)vec_splat((vec_f32)v_x00, 1); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 4 <= n8; i += 4) { + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x00); + vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x01); + + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + } + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x00); + vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x01); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x00, n); + vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x01, n); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + } else if (n) { + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x00, n); + vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x01, n); + + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} + +#ifdef USE_BFGEMV_8_N_MMA +static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; + __vector_quad temp[2*4]; + vec_f32 temp0[8*4], vy0[2*4]; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4; + b1 = a1 + lda4; + b2 = a2 + lda4; + b3 = a3 + lda4; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + vec_bf16 *va2 = (vec_bf16 *)a2; + vec_bf16 *va3 = (vec_bf16 *)a3; + vec_bf16 *vb0 = (vec_bf16 *)b0; + vec_bf16 *vb1 = (vec_bf16 *)b1; + vec_bf16 *vb2 = (vec_bf16 *)b2; + vec_bf16 *vb3 = (vec_bf16 *)b3; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_bf16 v_x00 = (vec_bf16)vec_load_vec(x_bf); + + vec_bf16 v_x01 = (vec_bf16)vec_splat((vec_f32)v_x00, 1); + vec_bf16 v_x02 = (vec_bf16)vec_splat((vec_f32)v_x00, 2); + vec_bf16 v_x03 = (vec_bf16)vec_splat((vec_f32)v_x00, 3); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 4 <= n8; i += 4) { + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x00); + vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x01); + vec_load_mult28b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], v_x02); + vec_load_mult28b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], v_x03); + + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + } + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x00); + vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x01); + vec_load_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x02); + vec_load_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x03); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n 
&= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x00, n); + vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x01, n); + vec_loadN_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x02, n); + vec_loadN_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x03, n); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + } else if (n) { + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x00, n); + vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x01, n); + vec_loadN_mult11b_mma(&temp[0], &vb0[i], &vb1[i], v_x02, n); + vec_loadN_mult11b_mma(&temp[0], &vb2[i], &vb3[i], v_x03, n); + + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} +#endif + +#include "sbgemv_n.c" +#else +#include "sbgemv_n_vsx.c" +#endif +#endif diff --git a/kernel/power/sbgemv_n_vsx.c b/kernel/power/sbgemv_n_vsx.c index cab2316d4e..e8f6dca9fc 100644 --- a/kernel/power/sbgemv_n_vsx.c +++ b/kernel/power/sbgemv_n_vsx.c @@ -25,12 +25,20 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#ifndef SBGEMV_N_VSX -#define SBGEMV_N_VSX +#ifndef SBGEMV_N_VSX_C +#define SBGEMV_N_VSX_C #include "sbgemv_common.c" -#define NBMAX 4096 +#ifndef BF16GEMV_N_X +#define BF16GEMV_N_X +#define BF16GEMV_N_8 BF16GEMV_N_VSX_8 +#define BF16GEMV_N_4 BF16GEMV_N_VSX_4 +#define BF16GEMV_N_2 BF16GEMV_N_VSX_2 +#define BF16GEMV_N_1 BF16GEMV_N_VSX_1 +#endif + +#define USE_BFGEMV_8_N_VSX static void BF16GEMV_N_VSX_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) { @@ -70,11 +78,11 @@ static void BF16GEMV_N_VSX_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { - vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); - vec_storeN_f32(vy0, &v_y[(i * 2) + 0], n); + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); } } @@ -121,12 +129,12 @@ static void BF16GEMV_N_VSX_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { - vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x1, &va1[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x1, &va1[i], n, zero); - vec_storeN_f32(vy0, &v_y[(i * 2) + 0], n); + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); } } @@ -183,17 +191,18 @@ static void BF16GEMV_N_VSX_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { - vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x1, &va1[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x2, &va2[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x3, &va3[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x1, &va1[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x2, &va2[i], n, zero); + vy0[0] += 
vec_loadNHi_mult2(v_x3, &va3[i], n, zero); - vec_storeN_f32(vy0, &v_y[(i * 2) + 0], n); + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); } } +#ifdef USE_BFGEMV_8_N_VSX static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT alpha) { IFLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; @@ -270,25 +279,21 @@ static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { - vec_f32 vy0 = vec_loadN_f32(&v_y[(i * 2) + 0], n); - - vy0 += vec_loadNHi_multi2(v_x0, &va0[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x1, &va1[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x2, &va2[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x3, &va3[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x4, &vb0[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x5, &vb1[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x6, &vb2[i], n, zero); - vy0 += vec_loadNHi_multi2(v_x7, &vb3[i], n, zero); - - vec_storeN_f32(vy0, &v_y[(i * 2) + 0], n); + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x1, &va1[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x2, &va2[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x3, &va3[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x4, &vb0[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x5, &vb1[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x6, &vb2[i], n, zero); + vy0[0] += vec_loadNHi_mult2(v_x7, &vb3[i], n, zero); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); } } - -#define BF16GEMV_N_8 BF16GEMV_N_VSX_8 -#define BF16GEMV_N_4 BF16GEMV_N_VSX_4 -#define BF16GEMV_N_2 BF16GEMV_N_VSX_2 -#define BF16GEMV_N_1 BF16GEMV_N_VSX_1 +#endif #include "sbgemv_n.c" #endif diff --git a/kernel/power/sbgemv_t.c b/kernel/power/sbgemv_t.c index f0c79fe77a..4cc8f060e9 100644 --- a/kernel/power/sbgemv_t.c +++ b/kernel/power/sbgemv_t.c @@ -27,6 +27,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef SBGEMV_T_COMMON_C #define SBGEMV_T_COMMON_C + +#if (defined(_ARCH_PWR10) && (defined(USE_BFGEMV_8_T_MMA) || (!defined(USE_BFGEMV_N_MMA) && defined(USE_BFGEMV_8_T_VSX)))) || (!defined(_ARCH_PWR10) && defined(USE_BFGEMV_8_T_VSX)) +#define USE_T_8 +#endif + int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) { IFLOAT *xbuffer, *a_ptr; @@ -39,7 +44,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * xbuffer = buffer; BLASLONG lda4 = lda << 2; +#ifdef USE_T_8 BLASLONG lda8 = lda << 3; +#endif BLASLONG NB = NBMAX; BLASLONG m2 = (m & (NBMAX - 1)); @@ -60,12 +67,16 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * } if (inc_y == 1) { +#ifdef USE_T_8 for (BLASLONG j = 0; j + 8 <= n; j += 8) { BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); y_ptr += 8; a_ptr += lda8; } if (n & 4) { +#else + for (BLASLONG j = 0; j + 4 <= n; j += 4) { +#endif BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); y_ptr += 4; a_ptr += lda4; @@ -79,6 +90,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); } } else { +#ifdef USE_T_8 for (BLASLONG j = 0; j + 8 <= n; j += 8) { memset(ybuffer, 0, sizeof(FLOAT) * 8); BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); @@ -87,6 +99,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * a_ptr += lda8; } if (n & 4) { +#else + for (BLASLONG j = 0; j + 4 <= n; j += 4) { +#endif memset(ybuffer, 0, sizeof(FLOAT) * 4); BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); copy_y(4, ybuffer, y_ptr, inc_y, beta); diff --git a/kernel/power/sbgemv_t_power10.c b/kernel/power/sbgemv_t_power10.c index 08bc4237c7..810287e89a 100644 --- a/kernel/power/sbgemv_t_power10.c +++ b/kernel/power/sbgemv_t_power10.c @@ -25,8 +25,334 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -//#include "sbgemv_common.c" +#ifndef SBGEMV_T_MMA_C +#define SBGEMV_T_MMA_C +#define USE_BFGEMV_T_MMA + +#ifdef USE_BFGEMV_T_MMA +#include "sbgemv_common_power10.c" + +#ifndef BF16GEMV_T_X +#define BF16GEMV_T_X +#define BF16GEMV_T_8 BF16GEMV_T_MMA_8 +#define BF16GEMV_T_4 BF16GEMV_T_MMA_4 +#define BF16GEMV_T_2 BF16GEMV_T_MMA_2 +#define BF16GEMV_T_1 BF16GEMV_T_MMA_1 +#endif + +#define USE_BFGEMV_8_T_MMA + +static void BF16GEMV_T_MMA_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +{ + IFLOAT *a0; + vec_bf16 *va0, *v_x; + __vector_quad temp0; + vec_f32 temp00[4]; + vec_bf16 inp[2]; + + __builtin_mma_xxsetaccz(&temp0); + + a0 = ap; + va0 = (vec_bf16 *)a0; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 2 <= n8; i += 2) { + vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); + + vec_load_mult2_mma(&temp0, &va0[i + 0], inp); + } + + if (n8 & 1) { + inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); + + vec_load_mult_mma(&temp0, &va0[i], inp[0]); + + i++; + } + + n &= 7; + if (n) { + inp[0] = vec_loadN(&v_x[i], n); + + vec_loadN_mult_mma(&temp0, &va0[i], inp[0], n); + } + + __builtin_mma_disassemble_acc((void*)temp00, &temp0); + + y[0] = (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])) + (beta * y[0]); +} + +static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +{ + IFLOAT *a0, *a1; + vec_bf16 *va0, *va1, *v_x; + __vector_quad temp0, temp1; + vec_f32 temp00[4], temp01[4]; + vec_bf16 inp[2]; + + __builtin_mma_xxsetaccz(&temp0); + __builtin_mma_xxsetaccz(&temp1); + + a0 = ap; + a1 = ap + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 2 <= n8; i += 2) { + vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); + + vec_load_mult2_mma(&temp0, &va0[i + 0], inp); + vec_load_mult2_mma(&temp1, &va1[i + 0], inp); + } + + if (n8 & 1) { + inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); + + vec_load_mult_mma(&temp0, &va0[i], inp[0]); + vec_load_mult_mma(&temp1, &va1[i], inp[0]); + + i++; + } + + n &= 7; + if (n) { + inp[0] = vec_loadN(&v_x[i], n); + + vec_loadN_mult_mma(&temp0, &va0[i], inp[0], n); + vec_loadN_mult_mma(&temp1, &va1[i], inp[0], n); + } + + __builtin_mma_disassemble_acc((void*)temp00, &temp0); + __builtin_mma_disassemble_acc((void*)temp01, &temp1); + + y[0] = (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])) + (beta * y[0]); + y[1] = (alpha * (temp01[0][0] + temp01[1][1] + temp01[2][2] + temp01[3][3])) + (beta * y[1]); +} + +static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +{ + IFLOAT *a0, *a1, *a2, *a3; + vec_bf16 *va0, *va1, *va2, *va3, *v_x; + __vector_quad temp0, temp1, temp2, temp3; + vec_f32 temp00[4], temp01[4], temp02[4], temp03[4]; + vec_bf16 inp[2]; + + __builtin_mma_xxsetaccz(&temp0); + __builtin_mma_xxsetaccz(&temp1); + __builtin_mma_xxsetaccz(&temp2); + __builtin_mma_xxsetaccz(&temp3); + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + va2 = (vec_bf16 *)a2; + va3 = (vec_bf16 *)a3; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 2 <= n8; i += 2) { + vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); + + vec_load_mult2_mma(&temp0, &va0[i + 0], inp); + vec_load_mult2_mma(&temp1, &va1[i + 0], inp); + 
vec_load_mult2_mma(&temp2, &va2[i + 0], inp); + vec_load_mult2_mma(&temp3, &va3[i + 0], inp); + } + + if (n8 & 1) { + inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); + + vec_load_mult_mma(&temp0, &va0[i], inp[0]); + vec_load_mult_mma(&temp1, &va1[i], inp[0]); + vec_load_mult_mma(&temp2, &va2[i], inp[0]); + vec_load_mult_mma(&temp3, &va3[i], inp[0]); + + i++; + } + + n &= 7; + if (n) { + inp[0] = vec_loadN(&v_x[i], n); + + vec_loadN_mult_mma(&temp0, &va0[i], inp[0], n); + vec_loadN_mult_mma(&temp1, &va1[i], inp[0], n); + vec_loadN_mult_mma(&temp2, &va2[i], inp[0], n); + vec_loadN_mult_mma(&temp3, &va3[i], inp[0], n); + } + + __builtin_mma_disassemble_acc((void*)temp00, &temp0); + __builtin_mma_disassemble_acc((void*)temp01, &temp1); + __builtin_mma_disassemble_acc((void*)temp02, &temp2); + __builtin_mma_disassemble_acc((void*)temp03, &temp3); + + vec_f32 t0, t1, t2, t3, t4, t5, t6, t7; + vec_f32 a = { alpha, alpha, alpha, alpha }; + vec_f32 b = { beta, beta, beta, beta }; + vec_f32 *v_y = (vec_f32 *) y; + + t0 = vec_mergeh(temp00[0], temp01[0]); + t1 = vec_mergeh(temp02[0], temp03[0]); + t2 = vec_mergeo(temp00[1], temp01[1]); + t3 = vec_mergeo(temp02[1], temp03[1]); + t4 = vec_mergel(temp00[2], temp01[2]); + t5 = vec_mergel(temp02[2], temp03[2]); + t6 = vec_mergeo(temp00[3], temp01[3]); + t7 = vec_mergeo(temp02[3], temp03[3]); + t0 = vec_xxpermdi(t0, t1, 0); + t2 = vec_xxpermdi(t2, t3, 0); + t4 = vec_xxpermdi(t4, t5, 0); + t6 = vec_xxpermdi(t6, t7, 3); + + t0 += t2 + t4 + t6; + + v_y[0] = (a * t0) + (b * v_y[0]); +} + +#ifdef USE_BFGEMV_8_T_MMA +static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +{ + IFLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + vec_bf16 *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + __vector_quad temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; + vec_f32 temp00[4], temp01[4], temp02[4], temp03[4], temp04[4], temp05[4], temp06[4], temp07[4]; + vec_bf16 inp[2]; + + __builtin_mma_xxsetaccz(&temp0); + __builtin_mma_xxsetaccz(&temp1); + __builtin_mma_xxsetaccz(&temp2); + __builtin_mma_xxsetaccz(&temp3); + __builtin_mma_xxsetaccz(&temp4); + __builtin_mma_xxsetaccz(&temp5); + __builtin_mma_xxsetaccz(&temp6); + __builtin_mma_xxsetaccz(&temp7); + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + va2 = (vec_bf16 *)a2; + va3 = (vec_bf16 *)a3; + va4 = (vec_bf16 *)a4; + va5 = (vec_bf16 *)a5; + va6 = (vec_bf16 *)a6; + va7 = (vec_bf16 *)a7; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 2 <= n8; i += 2) { + vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); + + vec_load_mult2_mma(&temp0, &va0[i + 0], inp); + vec_load_mult2_mma(&temp1, &va1[i + 0], inp); + vec_load_mult2_mma(&temp2, &va2[i + 0], inp); + vec_load_mult2_mma(&temp3, &va3[i + 0], inp); + vec_load_mult2_mma(&temp4, &va4[i + 0], inp); + vec_load_mult2_mma(&temp5, &va5[i + 0], inp); + vec_load_mult2_mma(&temp6, &va6[i + 0], inp); + vec_load_mult2_mma(&temp7, &va7[i + 0], inp); + } + + if (n8 & 1) { + inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); + + vec_load_mult_mma(&temp0, &va0[i], inp[0]); + vec_load_mult_mma(&temp1, &va1[i], inp[0]); + vec_load_mult_mma(&temp2, &va2[i], inp[0]); + vec_load_mult_mma(&temp3, &va3[i], inp[0]); + vec_load_mult_mma(&temp4, &va4[i], inp[0]); + vec_load_mult_mma(&temp5, &va5[i], inp[0]); + vec_load_mult_mma(&temp6, &va6[i], inp[0]); + vec_load_mult_mma(&temp7, &va7[i], 
inp[0]); + + i++; + } + + n &= 7; + if (n) { + inp[0] = vec_loadN(&v_x[i], n); + + vec_loadN_mult_mma(&temp0, &va0[i], inp[0], n); + vec_loadN_mult_mma(&temp1, &va1[i], inp[0], n); + vec_loadN_mult_mma(&temp2, &va2[i], inp[0], n); + vec_loadN_mult_mma(&temp3, &va3[i], inp[0], n); + vec_loadN_mult_mma(&temp4, &va4[i], inp[0], n); + vec_loadN_mult_mma(&temp5, &va5[i], inp[0], n); + vec_loadN_mult_mma(&temp6, &va6[i], inp[0], n); + vec_loadN_mult_mma(&temp7, &va7[i], inp[0], n); + } + + __builtin_mma_disassemble_acc((void*)temp00, &temp0); + __builtin_mma_disassemble_acc((void*)temp01, &temp1); + __builtin_mma_disassemble_acc((void*)temp02, &temp2); + __builtin_mma_disassemble_acc((void*)temp03, &temp3); + __builtin_mma_disassemble_acc((void*)temp04, &temp4); + __builtin_mma_disassemble_acc((void*)temp05, &temp5); + __builtin_mma_disassemble_acc((void*)temp06, &temp6); + __builtin_mma_disassemble_acc((void*)temp07, &temp7); + + vec_f32 t0, t1, t2, t3, t4, t5, t6, t7, t10, t11, t12, t13, t14, t15, t16, t17; + vec_f32 a = { alpha, alpha, alpha, alpha }; + vec_f32 b = { beta, beta, beta, beta }; + vec_f32 *v_y = (vec_f32 *) y; + + t0 = vec_mergeh(temp00[0], temp01[0]); + t1 = vec_mergeh(temp02[0], temp03[0]); + t2 = vec_mergeo(temp00[1], temp01[1]); + t3 = vec_mergeo(temp02[1], temp03[1]); + t4 = vec_mergel(temp00[2], temp01[2]); + t5 = vec_mergel(temp02[2], temp03[2]); + t6 = vec_mergeo(temp00[3], temp01[3]); + t7 = vec_mergeo(temp02[3], temp03[3]); + t0 = vec_xxpermdi(t0, t1, 0); + t2 = vec_xxpermdi(t2, t3, 0); + t4 = vec_xxpermdi(t4, t5, 0); + t6 = vec_xxpermdi(t6, t7, 3); + + t0 += t2 + t4 + t6; + + t10 = vec_mergeh(temp04[0], temp05[0]); + t11 = vec_mergeh(temp06[0], temp07[0]); + t12 = vec_mergeo(temp04[1], temp05[1]); + t13 = vec_mergeo(temp06[1], temp07[1]); + t14 = vec_mergel(temp04[2], temp05[2]); + t15 = vec_mergel(temp06[2], temp07[2]); + t16 = vec_mergeo(temp04[3], temp05[3]); + t17 = vec_mergeo(temp06[3], temp07[3]); + t10 = vec_xxpermdi(t10, t11, 0); + t12 = vec_xxpermdi(t12, t13, 0); + t14 = vec_xxpermdi(t14, t15, 0); + t16 = vec_xxpermdi(t16, t17, 3); + + t10 += t12 + t14 + t16; + + vec_f32 inp2[2]; + vec_load_pair(inp2, v_y); + inp2[0] = (a * t0) + (b * inp2[0]); + inp2[1] = (a * t10) + (b * inp2[1]); + vec_store_pair(v_y, inp2); +} +#endif + +#include "sbgemv_t.c" +#else #include "sbgemv_t_vsx.c" +#endif +#endif -//#include "sbgemv_t.c" diff --git a/kernel/power/sbgemv_t_vsx.c b/kernel/power/sbgemv_t_vsx.c index 7da894109b..399989bb52 100644 --- a/kernel/power/sbgemv_t_vsx.c +++ b/kernel/power/sbgemv_t_vsx.c @@ -25,12 +25,20 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#ifndef SBGEMV_T_VSX -#define SBGEMV_T_VSX +#ifndef SBGEMV_T_VSX_C +#define SBGEMV_T_VSX_C #include "sbgemv_common.c" -#define NBMAX 4096 +#ifndef BF16GEMV_T_X +#define BF16GEMV_T_X +#define BF16GEMV_T_8 BF16GEMV_T_VSX_8 +#define BF16GEMV_T_4 BF16GEMV_T_VSX_4 +#define BF16GEMV_T_2 BF16GEMV_T_VSX_2 +#define BF16GEMV_T_1 BF16GEMV_T_VSX_1 +#endif + +#define USE_BFGEMV_8_T_VSX static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) { @@ -58,9 +66,9 @@ static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp0 += vec_loadN_mult(&va0[i], inp, n, zero); } else if (n) { - vec_f32 v_inp0 = vec_loadNHi_vec(v_x, i, n, zero); + inp[0] = vec_loadNHi_vec(v_x, i, n, zero); - temp0 += vec_loadNHi_mult(&va0[i], v_inp0, n, zero); + temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); } y[0] = (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])) + (beta * y[0]); @@ -97,10 +105,10 @@ static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp0 += vec_loadN_mult(&va0[i], inp, n, zero); temp1 += vec_loadN_mult(&va1[i], inp, n, zero); } else if (n) { - vec_f32 v_inp0 = vec_loadNHi_vec(v_x, i, n, zero); + inp[0] = vec_loadNHi_vec(v_x, i, n, zero); - temp0 += vec_loadNHi_mult(&va0[i], v_inp0, n, zero); - temp1 += vec_loadNHi_mult(&va1[i], v_inp0, n, zero); + temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); + temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); } y[0] = (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])) + (beta * y[0]); @@ -148,12 +156,12 @@ static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp2 += vec_loadN_mult(&va2[i], inp, n, zero); temp3 += vec_loadN_mult(&va3[i], inp, n, zero); } else if (n) { - vec_f32 v_inp0 = vec_loadNHi_vec(v_x, i, n, zero); + inp[0] = vec_loadNHi_vec(v_x, i, n, zero); - temp0 += vec_loadNHi_mult(&va0[i], v_inp0, n, zero); - temp1 += vec_loadNHi_mult(&va1[i], v_inp0, n, zero); - temp2 += vec_loadNHi_mult(&va2[i], v_inp0, n, zero); - temp3 += vec_loadNHi_mult(&va3[i], v_inp0, n, zero); + temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); + temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); + temp2 += vec_loadNHi_mult(&va2[i], inp[0], n, zero); + temp3 += vec_loadNHi_mult(&va3[i], inp[0], n, zero); } vec_f32 t0, t1, t2, t3; @@ -174,6 +182,7 @@ static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL v_y[0] = (a * temp0) + (b * v_y[0]); } +#ifdef USE_BFGEMV_8_T_VSX static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) { IFLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; @@ -235,16 +244,16 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp6 += vec_loadN_mult(&va6[i], inp, n, zero); temp7 += vec_loadN_mult(&va7[i], inp, n, zero); } else if (n) { - vec_f32 v_inp0 = vec_loadNHi_vec(v_x, i, n, zero); - - temp0 += vec_loadNHi_mult(&va0[i], v_inp0, n, zero); - temp1 += vec_loadNHi_mult(&va1[i], v_inp0, n, zero); - temp2 += vec_loadNHi_mult(&va2[i], v_inp0, n, zero); - temp3 += vec_loadNHi_mult(&va3[i], v_inp0, n, zero); - temp4 += vec_loadNHi_mult(&va4[i], v_inp0, n, zero); - temp5 += vec_loadNHi_mult(&va5[i], v_inp0, n, zero); - temp6 += vec_loadNHi_mult(&va6[i], v_inp0, n, zero); - temp7 += vec_loadNHi_mult(&va7[i], v_inp0, n, zero); + inp[0] = vec_loadNHi_vec(v_x, i, n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); 
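+ /* editorial note, not in the original patch: this branch covers the
+ final 1-4 bf16 columns; the loadNHi helpers are assumed to load only
+ n elements (padding with `zero`), so each row accumulates real data */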
+ temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); + temp2 += vec_loadNHi_mult(&va2[i], inp[0], n, zero); + temp3 += vec_loadNHi_mult(&va3[i], inp[0], n, zero); + temp4 += vec_loadNHi_mult(&va4[i], inp[0], n, zero); + temp5 += vec_loadNHi_mult(&va5[i], inp[0], n, zero); + temp6 += vec_loadNHi_mult(&va6[i], inp[0], n, zero); + temp7 += vec_loadNHi_mult(&va7[i], inp[0], n, zero); } vec_f32 t0, t1, t2, t3; @@ -272,14 +281,12 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp7 = vec_mergel(t1, t3); temp4 += temp5 + temp6 + temp7; - v_y[0] = (a * temp0) + (b * v_y[0]); - v_y[1] = (a * temp4) + (b * v_y[1]); + vec_load_pair(inp, v_y); + inp[0] = (a * temp0) + (b * inp[0]); + inp[1] = (a * temp4) + (b * inp[1]); + vec_store_pair(v_y, inp); } - -#define BF16GEMV_T_8 BF16GEMV_T_VSX_8 -#define BF16GEMV_T_4 BF16GEMV_T_VSX_4 -#define BF16GEMV_T_2 BF16GEMV_T_VSX_2 -#define BF16GEMV_T_1 BF16GEMV_T_VSX_1 +#endif #include "sbgemv_t.c" #endif From c9ce37d527311145be20210fe6cef792aca7a6f5 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Mon, 23 Sep 2024 08:43:58 -0500 Subject: [PATCH 049/244] Force vector pairs in clang. --- kernel/power/gemm_common.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c index 0611ebc2a9..ed00de95b0 100644 --- a/kernel/power/gemm_common.c +++ b/kernel/power/gemm_common.c @@ -46,7 +46,11 @@ FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src) { #ifdef USE_VECTOR_PAIRS __vector_pair vy0p; +#ifdef __clang__ + vy0p = __builtin_vsx_lxvp(0L, (const __vector_pair *)(src)); +#else vy0p = *(__vector_pair *)(src); +#endif __builtin_vsx_disassemble_pair((void *)(dst), &vy0p); #else dst[0] = src[0]; @@ -59,7 +63,11 @@ FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src) #ifdef USE_VECTOR_PAIRS __vector_pair vy0p; __builtin_vsx_assemble_pair2(&vy0p, (vec_uc8)src[1], (vec_uc8)src[0]); +#ifdef __clang__ + __builtin_vsx_stxvp(vy0p, 0L, (__vector_pair *)(dst)); +#else *(__vector_pair *)(dst) = vy0p; +#endif #else dst[0] = src[0]; dst[1] = src[1]; From 05aa63e738edeb06ad2697a04f8889f6c0746067 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 24 Sep 2024 12:54:02 -0500 Subject: [PATCH 050/244] More MMA BF16 GEMV code. --- kernel/power/sbgemv_common_power10.c | 179 ++++++++++++++++++- kernel/power/sbgemv_n_power10.c | 246 ++++++++++++++++++++++----- 2 files changed, 372 insertions(+), 53 deletions(-) diff --git a/kernel/power/sbgemv_common_power10.c b/kernel/power/sbgemv_common_power10.c index da088014b0..2ee912b9d2 100644 --- a/kernel/power/sbgemv_common_power10.c +++ b/kernel/power/sbgemv_common_power10.c @@ -29,6 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SBGEMV_COMMON_MMA_C #include "sbgemv_common.c" +#if defined(_AIX) || defined(__clang__) +#define USE_MERGE_MMA +#endif + FORCEINLINE void vec_load_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) { vec_bf16 in0 = (vec_bf16)vec_load_vec(in); @@ -69,11 +73,13 @@ FORCEINLINE void vec_mult2_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) __builtin_mma_xvbf16ger2(&out[1], (vec_uc8)inp, (vec_uc8)in01); } +#ifndef USE_MERGE_MMA FORCEINLINE void vec_mult4_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 inp) { vec_mult2_mma(out + 0, in0[0], inp); vec_mult2_mma(out + 2, in0[1], inp); } +#endif FORCEINLINE void vec_loadN_mult11_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) { @@ -96,6 +102,7 @@ FORCEINLINE void vec_load_mult12_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 vec_mult2_mma(out, in0, inp); } +#ifndef USE_MERGE_MMA FORCEINLINE void vec_load_mult18_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) { vec_bf16 in0[4]; @@ -106,6 +113,7 @@ FORCEINLINE void vec_load_mult18_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 vec_mult4_mma(&out[0], in0 + 0, inp); vec_mult4_mma(&out[4], in0 + 2, inp); } +#endif FORCEINLINE void vec_reduce1_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) { @@ -120,6 +128,7 @@ FORCEINLINE void vec_reduce2_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_al vec_reduce1_mma(&out[1], &temp[4], v_alpha, &vy0[1]); } +#ifndef USE_MERGE_MMA FORCEINLINE void vec_reduce8_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) { vec_reduce2_mma(&out[0], &temp[0], v_alpha, vy0 + 0); @@ -127,6 +136,23 @@ FORCEINLINE void vec_reduce8_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_al vec_reduce2_mma(&out[4], &temp[16], v_alpha, vy0 + 4); vec_reduce2_mma(&out[6], &temp[24], v_alpha, vy0 + 6); } +#else +FORCEINLINE void vec_reduce44_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + __builtin_mma_disassemble_acc((void*)temp, &out[0]); + + vy0[0] += (temp[0] * v_alpha); + vy0[2] += (temp[1] * v_alpha); + vy0[4] += (temp[2] * v_alpha); + vy0[6] += (temp[3] * v_alpha); +} + +FORCEINLINE void vec_reduce84_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + vec_reduce44_mma(&out[0], &temp[0], v_alpha, vy0 + 0); + vec_reduce44_mma(&out[1], &temp[4], v_alpha, vy0 + 1); +} +#endif FORCEINLINE void vec_mult11a_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) { @@ -166,18 +192,25 @@ FORCEINLINE void vec_load_mult22a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf1 vec_mult2a_mma(out, in0, in1, inp); } -FORCEINLINE void vec_load_mult28a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +FORCEINLINE void vec_load4_mma(vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *ina, vec_bf16 *inb) { - vec_bf16 in0[4], in1[4]; - vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(ina + 0)); vec_load_pair((vec_f32 *)(in1 + 0), (vec_f32 *)(inb + 0)); vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(ina + 2)); vec_load_pair((vec_f32 *)(in1 + 2), (vec_f32 *)(inb + 2)); +} + +#ifndef USE_MERGE_MMA +FORCEINLINE void vec_load_mult28a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load4_mma(in0, in1, ina, inb); vec_mult4a_mma(&out[0], in0 + 0, in1 + 0, inp); vec_mult4a_mma(&out[4], in0 + 2, in1 + 2, inp); } +#endif FORCEINLINE void vec_loadN_mult22a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) { @@ -209,6 +242,48 @@ FORCEINLINE void vec_mult4b_mma(__vector_quad *out, vec_bf16 *in0, 
vec_bf16 *in1 vec_mult2b_mma(out + 2, in0[1], in1[1], inp); } +#ifdef USE_MERGE_MMA +FORCEINLINE void vec_mult1c_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) +{ + vec_bf16 in00 = vec_mergeh(in0, in0); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)inp, (vec_uc8)in00); +} + +FORCEINLINE void vec_mult2c_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) +{ + vec_bf16 in01 = vec_mergel(in0, in0); + + vec_mult1c_mma(&out[0], in0, inp); + + __builtin_mma_xvbf16ger2pp(&out[1], (vec_uc8)inp, (vec_uc8)in01); +} + +FORCEINLINE void vec_mult44_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_mult2_mma(out, in[0], inp[0]); + vec_mult2c_mma(out, in[1], inp[1]); +} + +FORCEINLINE void vec_mult44c_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_mult2c_mma(out, in[0], inp[0]); + vec_mult2c_mma(out, in[1], inp[1]); +} + +FORCEINLINE void vec_mult44a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) +{ + vec_mult2a_mma(out, in0[0], in1[0], inp[0]); + vec_mult2b_mma(out, in0[1], in1[1], inp[1]); +} + +FORCEINLINE void vec_mult44b_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) +{ + vec_mult2b_mma(out, in0[0], in1[0], inp[0]); + vec_mult2b_mma(out, in0[1], in1[1], inp[1]); +} +#endif + FORCEINLINE void vec_loadN_mult11b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) { vec_bf16 in0 = vec_loadN(ina, n); @@ -225,18 +300,48 @@ FORCEINLINE void vec_load_mult22b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf1 vec_mult2b_mma(out, in0, in1, inp); } +#ifndef USE_MERGE_MMA FORCEINLINE void vec_load_mult28b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) { vec_bf16 in0[4], in1[4]; - vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(ina + 0)); - vec_load_pair((vec_f32 *)(in1 + 0), (vec_f32 *)(inb + 0)); - vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(ina + 2)); - vec_load_pair((vec_f32 *)(in1 + 2), (vec_f32 *)(inb + 2)); + vec_load4_mma(in0, in1, ina, inb); vec_mult4b_mma(&out[0], in0 + 0, in1 + 0, inp); vec_mult4b_mma(&out[4], in0 + 2, in1 + 2, inp); } +#else +FORCEINLINE void vec_load_mult184_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_bf16 in0[4]; + + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(in + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(in + 2)); + + vec_mult44_mma(out, in0 + 0, inp + 0); + vec_mult44c_mma(out, in0 + 2, inp + 2); +} + +FORCEINLINE void vec_load_mult284a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 *inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load4_mma(in0, in1, ina, inb); + + vec_mult44a_mma(out, in0 + 0, in1 + 0, inp + 0); + vec_mult44b_mma(out, in0 + 2, in1 + 2, inp + 2); +} + +FORCEINLINE void vec_load_mult284b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 *inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load4_mma(in0, in1, ina, inb); + + vec_mult44b_mma(out, in0 + 0, in1 + 0, inp + 0); + vec_mult44b_mma(out, in0 + 2, in1 + 2, inp + 2); +} +#endif FORCEINLINE void vec_loadN_mult22b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) { @@ -262,4 +367,64 @@ FORCEINLINE void vec_store4_pair(vec_f32 *v_y, vec_f32 *vy0) vec_store_pair(v_y + 6, vy0 + 6); } +#ifdef USE_MERGE_MMA +FORCEINLINE void vec_load8_pair(vec_f32 *vy0, vec_f32 *v_y) +{ + vec_load4_pair(vy0 + 0, v_y + 0); + vec_load4_pair(vy0 + 8, v_y + 8); +} + +FORCEINLINE void vec_store8_pair(vec_f32 *v_y, vec_f32 *vy0) +{ + vec_store4_pair(v_y + 0, vy0 + 0); + vec_store4_pair(v_y + 8, vy0 + 8); +} + +#if __BYTE_ORDER__ == 
__ORDER_BIG_ENDIAN__ +#define VEC_SHIFT(data, shift) vec_sld(data, data, 16 - shift) +#else +#define VEC_SHIFT(data, shift) vec_sld(data, data, shift) +#endif + +typedef __vector unsigned int vec_ui32; + +static vec_ui32 mask_0 = { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 }; +static vec_ui32 mask_1 = { 0x00000000, 0xffffffff, 0x00000000, 0x00000000 }; +static vec_ui32 mask_2 = { 0x00000000, 0x00000000, 0xffffffff, 0x00000000 }; +static vec_ui32 mask_3 = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }; + +FORCEINLINE void vec_make_mult1(vec_bf16 *v_x0) +{ + v_x0[ 0] = vec_and(v_x0[0], (vec_bf16)mask_0); + + v_x0[ 1] = VEC_SHIFT(v_x0[ 0], 4); + v_x0[ 2] = VEC_SHIFT(v_x0[ 0], 8); + v_x0[ 3] = VEC_SHIFT(v_x0[ 0], 12); +} + +FORCEINLINE void vec_make_mult2(vec_bf16 *v_x0) +{ + v_x0[ 5] = vec_and(v_x0[0], (vec_bf16)mask_1); + vec_make_mult1(v_x0); + + v_x0[ 4] = VEC_SHIFT(v_x0[ 5], 12); + v_x0[ 6] = VEC_SHIFT(v_x0[ 5], 4); + v_x0[ 7] = VEC_SHIFT(v_x0[ 5], 8); +} + +FORCEINLINE void vec_make_mult4(vec_bf16 *v_x0) +{ + v_x0[10] = vec_and(v_x0[0], (vec_bf16)mask_2); + v_x0[15] = vec_and(v_x0[0], (vec_bf16)mask_3); + vec_make_mult2(v_x0); + + v_x0[ 8] = VEC_SHIFT(v_x0[10], 8); + v_x0[ 9] = VEC_SHIFT(v_x0[10], 12); + v_x0[11] = VEC_SHIFT(v_x0[10], 4); + v_x0[12] = VEC_SHIFT(v_x0[15], 4); + v_x0[13] = VEC_SHIFT(v_x0[15], 8); + v_x0[14] = VEC_SHIFT(v_x0[15], 12); +} +#endif + #endif diff --git a/kernel/power/sbgemv_n_power10.c b/kernel/power/sbgemv_n_power10.c index 7b2beb0c7b..f2ed6bf9a8 100644 --- a/kernel/power/sbgemv_n_power10.c +++ b/kernel/power/sbgemv_n_power10.c @@ -28,7 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef SBGEMV_N_MMA_C #define SBGEMV_N_MMA_C +#if !defined(_AIX) || defined(__clang__) #define USE_BFGEMV_N_MMA +#endif #ifdef USE_BFGEMV_N_MMA #include "sbgemv_common_power10.c" @@ -47,7 +49,7 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA { IFLOAT *a0; __vector_quad temp[2*4]; - vec_f32 temp0[8*4], vy0[2*4]; + vec_f32 temp0[8*4]; vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; a0 = ap[0]; @@ -55,26 +57,61 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_bf16 *va0 = (vec_bf16 *)a0; vec_bf16 *x_bf = (vec_bf16 *)(xo); - vec_bf16 v_x0 = vec_loadN(x_bf, 1); vec_f32 *v_y = (vec_f32 *)y; BLASLONG n8 = n / 8; BLASLONG i = 0; +#ifdef USE_MERGE_MMA + vec_bf16 v_x0[4]; + v_x0[0] = vec_loadN(x_bf, 1); + vec_f32 vy0[2*4*2]; + + vec_make_mult1(v_x0); + + for (; i + 8 <= n8; i += 8) { + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult184_mma(&temp[0], &va0[i + 0], &v_x0[ 0]); + vec_load_mult184_mma(&temp[2], &va0[i + 4], &v_x0[ 0]); + + vec_reduce84_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); + vec_reduce84_mma(&temp[2], temp0 + 8, v_alpha, vy0 + 8); + + vec_store8_pair(&v_y[(i * 2) + 0], vy0); + } + + if (n8 & 4) { + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult184_mma(&temp[0], &va0[i + 0], &v_x0[ 0]); + + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + + i += 4; + } +#else + vec_bf16 v_x0[1]; + v_x0[0] = vec_loadN(x_bf, 1); + vec_f32 vy0[2*4]; + for (; i + 4 <= n8; i += 4) { vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult18_mma(&temp[0], &va0[i + 0], v_x0); + vec_load_mult18_mma(&temp[0], &va0[i + 0], v_x0[ 0]); vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); } +#endif for (; i < n8; i++) { vec_load_pair(vy0, &v_y[(i * 2) + 0]); - 
vec_load_mult12_mma(&temp[0], &va0[i], v_x0); + vec_load_mult12_mma(&temp[0], &va0[i], v_x0[ 0]); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); @@ -86,7 +123,7 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA BLASLONG n3 = n & 3; vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); - vec_loadN_mult12_mma(&temp[0], &va0[i], v_x0, n); + vec_loadN_mult12_mma(&temp[0], &va0[i], v_x0[ 0], n); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); @@ -94,7 +131,7 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA } else if (n) { vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vec_loadN_mult11_mma(&temp[0], &va0[i], v_x0, n); + vec_loadN_mult11_mma(&temp[0], &va0[i], v_x0[ 0], n); vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); @@ -106,7 +143,7 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA { IFLOAT *a0, *a1; __vector_quad temp[2*4]; - vec_f32 temp0[8*4], vy0[2*4]; + vec_f32 temp0[8*4]; vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; a0 = ap[0]; @@ -116,26 +153,61 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_bf16 *va1 = (vec_bf16 *)a1; vec_bf16 *x_bf = (vec_bf16 *)(xo); - vec_bf16 v_x0 = vec_loadN(x_bf, 2); vec_f32 *v_y = (vec_f32 *)y; BLASLONG n8 = n / 8; BLASLONG i = 0; +#ifdef USE_MERGE_MMA + vec_bf16 v_x0[4]; + vec_f32 vy0[2*4*2]; + v_x0[0] = vec_loadN(x_bf, 2); + + vec_make_mult1(v_x0); + + for (; i + 8 <= n8; i += 8) { + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult284a_mma(&temp[2], &va0[i + 4], &va1[i + 4], &v_x0[ 0]); + + vec_reduce84_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); + vec_reduce84_mma(&temp[2], temp0 + 8, v_alpha, vy0 + 8); + + vec_store8_pair(&v_y[(i * 2) + 0], vy0); + } + + if (n8 & 4) { + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + + i += 4; + } +#else + vec_bf16 v_x0[1]; + vec_f32 vy0[2*4]; + v_x0[0] = vec_loadN(x_bf, 2); + for (; i + 4 <= n8; i += 4) { vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0); + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); } +#endif for (; i < n8; i++) { vec_load_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0); + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); @@ -147,7 +219,7 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA BLASLONG n3 = n & 3; vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); - vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0, n); + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); @@ -155,7 +227,7 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA } else if (n) { vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0, n); + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); @@ -167,7 +239,7 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA { IFLOAT *a0, *a1, *a2, *a3; __vector_quad temp[2*4]; - vec_f32 temp0[8*4], 
vy0[2*4]; + vec_f32 temp0[8*4]; vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; a0 = ap[0]; @@ -181,30 +253,68 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_bf16 *va3 = (vec_bf16 *)a3; vec_bf16 *x_bf = (vec_bf16 *)(xo); - vec_bf16 v_x00 = vec_loadN(x_bf, 4); - - vec_bf16 v_x01 = (vec_bf16)vec_splat((vec_f32)v_x00, 1); vec_f32 *v_y = (vec_f32 *)y; BLASLONG n8 = n / 8; BLASLONG i = 0; +#ifdef USE_MERGE_MMA + vec_bf16 v_x0[8]; + vec_f32 vy0[2*4*2]; + v_x0[0] = vec_loadN(x_bf, 4); + + vec_make_mult2(v_x0); + + for (; i + 8 <= n8; i += 8) { + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + vec_load_mult284a_mma(&temp[2], &va0[i + 4], &va1[i + 4], &v_x0[ 0]); + vec_load_mult284b_mma(&temp[2], &va2[i + 4], &va3[i + 4], &v_x0[ 4]); + + vec_reduce84_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); + vec_reduce84_mma(&temp[2], temp0 + 8, v_alpha, vy0 + 8); + + vec_store8_pair(&v_y[(i * 2) + 0], vy0); + } + + if (n8 & 4) { + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + + i += 4; + } +#else + vec_bf16 v_x0[5]; + vec_f32 vy0[2*4]; + v_x0[0] = vec_loadN(x_bf, 4); + + v_x0[ 4] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 1); + for (; i + 4 <= n8; i += 4) { vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x00); - vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x01); + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); + vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x0[ 4]); vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); } +#endif for (; i < n8; i++) { vec_load_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x00); - vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x01); + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); + vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4]); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); @@ -216,8 +326,8 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA BLASLONG n3 = n & 3; vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); - vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x00, n); - vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x01, n); + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); @@ -225,8 +335,8 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA } else if (n) { vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x00, n); - vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x01, n); + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); @@ -239,7 +349,7 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS { IFLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; __vector_quad temp[2*4]; - vec_f32 temp0[8*4], vy0[2*4]; + vec_f32 temp0[8*4]; vec_f32 v_alpha = { alpha, alpha, alpha, 
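/* alpha broadcast into all four f32 lanes */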
alpha }; a0 = ap[0]; @@ -261,36 +371,80 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS vec_bf16 *vb3 = (vec_bf16 *)b3; vec_bf16 *x_bf = (vec_bf16 *)(xo); - vec_bf16 v_x00 = (vec_bf16)vec_load_vec(x_bf); - - vec_bf16 v_x01 = (vec_bf16)vec_splat((vec_f32)v_x00, 1); - vec_bf16 v_x02 = (vec_bf16)vec_splat((vec_f32)v_x00, 2); - vec_bf16 v_x03 = (vec_bf16)vec_splat((vec_f32)v_x00, 3); vec_f32 *v_y = (vec_f32 *)y; BLASLONG n8 = n / 8; BLASLONG i = 0; +#ifdef USE_MERGE_MMA + vec_bf16 v_x0[16]; + vec_f32 vy0[2*4*2]; + v_x0[0] = (vec_bf16)vec_load_vec(x_bf); + + vec_make_mult4(v_x0); + + for (; i + 8 <= n8; i += 8) { + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + vec_load_mult284b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], &v_x0[ 8]); + vec_load_mult284b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], &v_x0[12]); + vec_load_mult284a_mma(&temp[2], &va0[i + 4], &va1[i + 4], &v_x0[ 0]); + vec_load_mult284b_mma(&temp[2], &va2[i + 4], &va3[i + 4], &v_x0[ 4]); + vec_load_mult284b_mma(&temp[2], &vb0[i + 4], &vb1[i + 4], &v_x0[ 8]); + vec_load_mult284b_mma(&temp[2], &vb2[i + 4], &vb3[i + 4], &v_x0[12]); + + vec_reduce84_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); + vec_reduce84_mma(&temp[2], temp0 + 8, v_alpha, vy0 + 8); + + vec_store8_pair(&v_y[(i * 2) + 0], vy0); + } + + if (n8 & 4) { + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + vec_load_mult284b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], &v_x0[ 8]); + vec_load_mult284b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], &v_x0[12]); + + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + + i += 4; + } +#else + vec_bf16 v_x0[13]; + vec_f32 vy0[2*4]; + v_x0[0] = (vec_bf16)vec_load_vec(x_bf); + + v_x0[ 4] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 1); + v_x0[ 8] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 2); + v_x0[12] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 3); + for (; i + 4 <= n8; i += 4) { vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x00); - vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x01); - vec_load_mult28b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], v_x02); - vec_load_mult28b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], v_x03); + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); + vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x0[ 4]); + vec_load_mult28b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], v_x0[ 8]); + vec_load_mult28b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], v_x0[12]); vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); } +#endif for (; i < n8; i++) { vec_load_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x00); - vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x01); - vec_load_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x02); - vec_load_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x03); + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); + vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4]); + vec_load_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8]); + vec_load_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12]); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); @@ -302,10 +456,10 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, 
IFLOAT *xo, FLOAT *y, BLAS BLASLONG n3 = n & 3; vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); - vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x00, n); - vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x01, n); - vec_loadN_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x02, n); - vec_loadN_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x03, n); + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); + vec_loadN_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8], n); + vec_loadN_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12], n); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); @@ -313,10 +467,10 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS } else if (n) { vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x00, n); - vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x01, n); - vec_loadN_mult11b_mma(&temp[0], &vb0[i], &vb1[i], v_x02, n); - vec_loadN_mult11b_mma(&temp[0], &vb2[i], &vb3[i], v_x03, n); + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); + vec_loadN_mult11b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8], n); + vec_loadN_mult11b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12], n); vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); From df19375560641fa213b3332d8fa775efa77f7756 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 24 Sep 2024 16:30:01 -0500 Subject: [PATCH 051/244] Almost final code for MMA. --- kernel/power/sbgemv_common_power10.c | 93 ++++++++++++++++++++-------- kernel/power/sbgemv_n_power10.c | 41 +++++------- 2 files changed, 80 insertions(+), 54 deletions(-) diff --git a/kernel/power/sbgemv_common_power10.c b/kernel/power/sbgemv_common_power10.c index 2ee912b9d2..d24a98418f 100644 --- a/kernel/power/sbgemv_common_power10.c +++ b/kernel/power/sbgemv_common_power10.c @@ -152,6 +152,14 @@ FORCEINLINE void vec_reduce84_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_a vec_reduce44_mma(&out[0], &temp[0], v_alpha, vy0 + 0); vec_reduce44_mma(&out[1], &temp[4], v_alpha, vy0 + 1); } + +FORCEINLINE void vec_reduce88_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + vec_reduce44_mma(&out[0], &temp[ 0], v_alpha, vy0 + 0); + vec_reduce44_mma(&out[1], &temp[ 4], v_alpha, vy0 + 1); + vec_reduce44_mma(&out[2], &temp[ 8], v_alpha, vy0 + 8); + vec_reduce44_mma(&out[3], &temp[12], v_alpha, vy0 + 9); +} #endif FORCEINLINE void vec_mult11a_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) @@ -341,6 +349,32 @@ FORCEINLINE void vec_load_mult284b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf vec_mult44b_mma(out, in0 + 0, in1 + 0, inp + 0); vec_mult44b_mma(out, in0 + 2, in1 + 2, inp + 2); } + +FORCEINLINE void vec_load_mult288a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 *inp) +{ + vec_bf16 in0[8], in1[8]; + + vec_load4_mma(in0 + 0, in1 + 0, ina + 0, inb + 0); + vec_load4_mma(in0 + 4, in1 + 4, ina + 4, inb + 4); + + vec_mult44a_mma(out + 0, in0 + 0, in1 + 0, inp + 0); + vec_mult44a_mma(out + 2, in0 + 4, in1 + 4, inp + 0); + vec_mult44b_mma(out + 0, in0 + 2, in1 + 2, inp + 2); + vec_mult44b_mma(out + 2, in0 + 6, in1 + 6, inp + 2); +} + +FORCEINLINE void vec_load_mult288b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 *inp) +{ + vec_bf16 in0[8], in1[8]; + + vec_load4_mma(in0 + 0, in1 + 0, ina + 0, inb + 0); + vec_load4_mma(in0 + 4, in1 + 4, ina + 4, inb + 4); + + vec_mult44b_mma(out + 
0, in0 + 0, in1 + 0, inp + 0); + vec_mult44b_mma(out + 2, in0 + 4, in1 + 4, inp + 0); + vec_mult44b_mma(out + 0, in0 + 2, in1 + 2, inp + 2); + vec_mult44b_mma(out + 2, in0 + 6, in1 + 6, inp + 2); +} #endif FORCEINLINE void vec_loadN_mult22b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) @@ -381,49 +415,54 @@ FORCEINLINE void vec_store8_pair(vec_f32 *v_y, vec_f32 *vy0) } #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -#define VEC_SHIFT(data, shift) vec_sld(data, data, 16 - shift) -#else -#define VEC_SHIFT(data, shift) vec_sld(data, data, shift) -#endif +#define VEC_SHIFT(data, shift) vec_sldw(data, data, 4 - shift) -typedef __vector unsigned int vec_ui32; +#define MASK_0 0xf000 +#define MASK_1 0x0f00 +#define MASK_2 0x00f0 +#define MASK_3 0x000f +#else +#define VEC_SHIFT(data, shift) vec_sldw(data, data, shift) -static vec_ui32 mask_0 = { 0xffffffff, 0x00000000, 0x00000000, 0x00000000 }; -static vec_ui32 mask_1 = { 0x00000000, 0xffffffff, 0x00000000, 0x00000000 }; -static vec_ui32 mask_2 = { 0x00000000, 0x00000000, 0xffffffff, 0x00000000 }; -static vec_ui32 mask_3 = { 0x00000000, 0x00000000, 0x00000000, 0xffffffff }; +#define MASK_0 0x000f +#define MASK_1 0x00f0 +#define MASK_2 0x0f00 +#define MASK_3 0xf000 +#endif -FORCEINLINE void vec_make_mult1(vec_bf16 *v_x0) +FORCEINLINE void vec_make_mult1(vec_bf16 *v_x0, const bool mask) { - v_x0[ 0] = vec_and(v_x0[0], (vec_bf16)mask_0); + if (mask) { + v_x0[ 0] = vec_and(v_x0[0], (vec_bf16)vec_genbm(MASK_0)); + } - v_x0[ 1] = VEC_SHIFT(v_x0[ 0], 4); - v_x0[ 2] = VEC_SHIFT(v_x0[ 0], 8); - v_x0[ 3] = VEC_SHIFT(v_x0[ 0], 12); + v_x0[ 1] = VEC_SHIFT(v_x0[ 0], 1); + v_x0[ 2] = VEC_SHIFT(v_x0[ 0], 2); + v_x0[ 3] = VEC_SHIFT(v_x0[ 0], 3); } FORCEINLINE void vec_make_mult2(vec_bf16 *v_x0) { - v_x0[ 5] = vec_and(v_x0[0], (vec_bf16)mask_1); - vec_make_mult1(v_x0); + v_x0[ 5] = vec_and(v_x0[0], (vec_bf16)vec_genbm(MASK_1)); + vec_make_mult1(v_x0, true); - v_x0[ 4] = VEC_SHIFT(v_x0[ 5], 12); - v_x0[ 6] = VEC_SHIFT(v_x0[ 5], 4); - v_x0[ 7] = VEC_SHIFT(v_x0[ 5], 8); + v_x0[ 4] = VEC_SHIFT(v_x0[ 5], 3); + v_x0[ 6] = VEC_SHIFT(v_x0[ 5], 1); + v_x0[ 7] = VEC_SHIFT(v_x0[ 5], 2); } FORCEINLINE void vec_make_mult4(vec_bf16 *v_x0) { - v_x0[10] = vec_and(v_x0[0], (vec_bf16)mask_2); - v_x0[15] = vec_and(v_x0[0], (vec_bf16)mask_3); + v_x0[10] = vec_and(v_x0[0], (vec_bf16)vec_genbm(MASK_2)); + v_x0[15] = vec_and(v_x0[0], (vec_bf16)vec_genbm(MASK_3)); vec_make_mult2(v_x0); - v_x0[ 8] = VEC_SHIFT(v_x0[10], 8); - v_x0[ 9] = VEC_SHIFT(v_x0[10], 12); - v_x0[11] = VEC_SHIFT(v_x0[10], 4); - v_x0[12] = VEC_SHIFT(v_x0[15], 4); - v_x0[13] = VEC_SHIFT(v_x0[15], 8); - v_x0[14] = VEC_SHIFT(v_x0[15], 12); + v_x0[ 8] = VEC_SHIFT(v_x0[10], 2); + v_x0[ 9] = VEC_SHIFT(v_x0[10], 3); + v_x0[11] = VEC_SHIFT(v_x0[10], 1); + v_x0[12] = VEC_SHIFT(v_x0[15], 1); + v_x0[13] = VEC_SHIFT(v_x0[15], 2); + v_x0[14] = VEC_SHIFT(v_x0[15], 3); } #endif diff --git a/kernel/power/sbgemv_n_power10.c b/kernel/power/sbgemv_n_power10.c index f2ed6bf9a8..e75f394e72 100644 --- a/kernel/power/sbgemv_n_power10.c +++ b/kernel/power/sbgemv_n_power10.c @@ -28,9 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef SBGEMV_N_MMA_C #define SBGEMV_N_MMA_C -#if !defined(_AIX) || defined(__clang__) #define USE_BFGEMV_N_MMA -#endif #ifdef USE_BFGEMV_N_MMA #include "sbgemv_common_power10.c" @@ -67,7 +65,7 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA v_x0[0] = vec_loadN(x_bf, 1); vec_f32 vy0[2*4*2]; - vec_make_mult1(v_x0); + vec_make_mult1(v_x0, false); for (; i + 8 <= n8; i += 8) { vec_load8_pair(vy0, &v_y[(i * 2) + 0]); @@ -75,8 +73,7 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_load_mult184_mma(&temp[0], &va0[i + 0], &v_x0[ 0]); vec_load_mult184_mma(&temp[2], &va0[i + 4], &v_x0[ 0]); - vec_reduce84_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); - vec_reduce84_mma(&temp[2], temp0 + 8, v_alpha, vy0 + 8); + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); vec_store8_pair(&v_y[(i * 2) + 0], vy0); } @@ -163,16 +160,14 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_f32 vy0[2*4*2]; v_x0[0] = vec_loadN(x_bf, 2); - vec_make_mult1(v_x0); + vec_make_mult1(v_x0, false); for (; i + 8 <= n8; i += 8) { vec_load8_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); - vec_load_mult284a_mma(&temp[2], &va0[i + 4], &va1[i + 4], &v_x0[ 0]); + vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); - vec_reduce84_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); - vec_reduce84_mma(&temp[2], temp0 + 8, v_alpha, vy0 + 8); + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); vec_store8_pair(&v_y[(i * 2) + 0], vy0); } @@ -268,13 +263,10 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA for (; i + 8 <= n8; i += 8) { vec_load8_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); - vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); - vec_load_mult284a_mma(&temp[2], &va0[i + 4], &va1[i + 4], &v_x0[ 0]); - vec_load_mult284b_mma(&temp[2], &va2[i + 4], &va3[i + 4], &v_x0[ 4]); + vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult288b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); - vec_reduce84_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); - vec_reduce84_mma(&temp[2], temp0 + 8, v_alpha, vy0 + 8); + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); vec_store8_pair(&v_y[(i * 2) + 0], vy0); } @@ -386,17 +378,12 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS for (; i + 8 <= n8; i += 8) { vec_load8_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); - vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); - vec_load_mult284b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], &v_x0[ 8]); - vec_load_mult284b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], &v_x0[12]); - vec_load_mult284a_mma(&temp[2], &va0[i + 4], &va1[i + 4], &v_x0[ 0]); - vec_load_mult284b_mma(&temp[2], &va2[i + 4], &va3[i + 4], &v_x0[ 4]); - vec_load_mult284b_mma(&temp[2], &vb0[i + 4], &vb1[i + 4], &v_x0[ 8]); - vec_load_mult284b_mma(&temp[2], &vb2[i + 4], &vb3[i + 4], &v_x0[12]); - - vec_reduce84_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); - vec_reduce84_mma(&temp[2], temp0 + 8, v_alpha, vy0 + 8); + vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult288b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + vec_load_mult288b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], &v_x0[ 8]); + vec_load_mult288b_mma(&temp[0], &vb2[i + 
0], &vb3[i + 0], &v_x0[12]); + + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); vec_store8_pair(&v_y[(i * 2) + 0], vy0); } From 8ab6245771b9b8b52a937c35a447c218f41e3bbd Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 24 Sep 2024 16:50:21 -0500 Subject: [PATCH 052/244] Small change. --- kernel/power/sbgemv_n_power10.c | 100 ++++++++++++++++---------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/kernel/power/sbgemv_n_power10.c b/kernel/power/sbgemv_n_power10.c index e75f394e72..f33a246a99 100644 --- a/kernel/power/sbgemv_n_power10.c +++ b/kernel/power/sbgemv_n_power10.c @@ -68,21 +68,21 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_make_mult1(v_x0, false); for (; i + 8 <= n8; i += 8) { - vec_load8_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult184_mma(&temp[0], &va0[i + 0], &v_x0[ 0]); vec_load_mult184_mma(&temp[2], &va0[i + 4], &v_x0[ 0]); + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); vec_store8_pair(&v_y[(i * 2) + 0], vy0); } if (n8 & 4) { - vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult184_mma(&temp[0], &va0[i + 0], &v_x0[ 0]); + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); @@ -95,10 +95,10 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_f32 vy0[2*4]; for (; i + 4 <= n8; i += 4) { - vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult18_mma(&temp[0], &va0[i + 0], v_x0[ 0]); + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); @@ -106,10 +106,10 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA #endif for (; i < n8; i++) { - vec_load_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult12_mma(&temp[0], &va0[i], v_x0[ 0]); + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); vec_store_pair(&v_y[(i * 2) + 0], vy0); @@ -117,19 +117,19 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA n &= 7; if (n > 4) { + vec_loadN_mult12_mma(&temp[0], &va0[i], v_x0[ 0], n); + BLASLONG n3 = n & 3; vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); - vec_loadN_mult12_mma(&temp[0], &va0[i], v_x0[ 0], n); - vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { - vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vec_loadN_mult11_mma(&temp[0], &va0[i], v_x0[ 0], n); + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); @@ -163,20 +163,20 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_make_mult1(v_x0, false); for (; i + 8 <= n8; i += 8) { - vec_load8_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); vec_store8_pair(&v_y[(i * 2) + 0], vy0); } if (n8 & 4) { - vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); @@ -189,10 +189,10 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA v_x0[0] = vec_loadN(x_bf, 2); for (; i + 4 <= 
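/* n8 = n/8 eight-element blocks */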
n8; i += 4) { - vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); @@ -200,10 +200,10 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA #endif for (; i < n8; i++) { - vec_load_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); vec_store_pair(&v_y[(i * 2) + 0], vy0); @@ -211,19 +211,19 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA n &= 7; if (n > 4) { + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + BLASLONG n3 = n & 3; vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); - vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); - vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { - vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); @@ -261,22 +261,22 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_make_mult2(v_x0); for (; i + 8 <= n8; i += 8) { - vec_load8_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); vec_load_mult288b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); vec_store8_pair(&v_y[(i * 2) + 0], vy0); } if (n8 & 4) { - vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); @@ -291,11 +291,11 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA v_x0[ 4] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 1); for (; i + 4 <= n8; i += 4) { - vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x0[ 4]); + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); @@ -303,11 +303,11 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA #endif for (; i < n8; i++) { - vec_load_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4]); + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); vec_store_pair(&v_y[(i * 2) + 0], vy0); @@ -315,21 +315,21 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA n &= 7; if (n > 4) { - BLASLONG n3 = n & 3; - vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); - vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if 
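/* 1-4 elements left */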
(n) { - vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); @@ -376,26 +376,26 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS vec_make_mult4(v_x0); for (; i + 8 <= n8; i += 8) { - vec_load8_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); vec_load_mult288b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); vec_load_mult288b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], &v_x0[ 8]); vec_load_mult288b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], &v_x0[12]); + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); vec_store8_pair(&v_y[(i * 2) + 0], vy0); } if (n8 & 4) { - vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); vec_load_mult284b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], &v_x0[ 8]); vec_load_mult284b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], &v_x0[12]); + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); @@ -412,13 +412,13 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS v_x0[12] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 3); for (; i + 4 <= n8; i += 4) { - vec_load4_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x0[ 4]); vec_load_mult28b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], v_x0[ 8]); vec_load_mult28b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], v_x0[12]); + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); vec_store4_pair(&v_y[(i * 2) + 0], vy0); @@ -426,13 +426,13 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS #endif for (; i < n8; i++) { - vec_load_pair(vy0, &v_y[(i * 2) + 0]); - vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4]); vec_load_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8]); vec_load_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12]); + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); vec_store_pair(&v_y[(i * 2) + 0], vy0); @@ -440,25 +440,25 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS n &= 7; if (n > 4) { - BLASLONG n3 = n & 3; - vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); - vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); vec_loadN_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8], n); vec_loadN_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12], n); + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); } else if (n) { - vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); vec_loadN_mult11b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8], n); vec_loadN_mult11b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12], n); + vy0[0] = 
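/* load only the n remaining y elements */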
vec_loadN_f32(&v_y[(i * 2) + 0], n); + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); From fb287d17fc1a5f53920f0b8c29ba476b258950bc Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 25 Sep 2024 16:31:36 -0500 Subject: [PATCH 053/244] Common code. --- kernel/power/sbgemv_common_power10.c | 138 +++++++++++++++++ kernel/power/sbgemv_n_power10.c | 24 +-- kernel/power/sbgemv_t_power10.c | 223 ++++++++++++++------------- 3 files changed, 263 insertions(+), 122 deletions(-) diff --git a/kernel/power/sbgemv_common_power10.c b/kernel/power/sbgemv_common_power10.c index d24a98418f..638e2655c0 100644 --- a/kernel/power/sbgemv_common_power10.c +++ b/kernel/power/sbgemv_common_power10.c @@ -33,6 +33,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define USE_MERGE_MMA #endif +FORCEINLINE void vec_load_pair2(vec_bf16 *in0, vec_bf16 *in) +{ + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(in + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(in + 2)); +} + FORCEINLINE void vec_load_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) { vec_bf16 in0 = (vec_bf16)vec_load_vec(in); @@ -40,6 +46,28 @@ FORCEINLINE void vec_load_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 in __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0, (vec_uc8)inp); } +FORCEINLINE void vec_load_mult12a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp) +{ + vec_bf16 in01 = (vec_bf16)vec_load_vec(in0); + vec_bf16 in11 = (vec_bf16)vec_load_vec(in1); + + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); +} + +FORCEINLINE void vec_load_mult14_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 inp) +{ + vec_bf16 in01 = (vec_bf16)vec_load_vec(in0); + vec_bf16 in11 = (vec_bf16)vec_load_vec(in1); + vec_bf16 in21 = (vec_bf16)vec_load_vec(in2); + vec_bf16 in31 = (vec_bf16)vec_load_vec(in3); + + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31, (vec_uc8)inp); +} + FORCEINLINE void vec_load_mult2_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) { vec_bf16 in0[2]; @@ -50,6 +78,94 @@ FORCEINLINE void vec_load_mult2_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 * __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[1], (vec_uc8)inp[1]); } +FORCEINLINE void vec_load_mult22_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) +{ + vec_bf16 in01[2], in11[2]; + + vec_load_pair((vec_f32 *)in01, (vec_f32 *)in0); + vec_load_pair((vec_f32 *)in11, (vec_f32 *)in1); + + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[1], (vec_uc8)inp[1]); +} + +FORCEINLINE void vec_load_mult24_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 *inp) +{ + vec_bf16 in01[2], in11[2], in21[2], in31[2]; + + vec_load_pair((vec_f32 *)in01, (vec_f32 *)in0); + vec_load_pair((vec_f32 *)in11, (vec_f32 *)in1); + vec_load_pair((vec_f32 *)in21, (vec_f32 *)in2); + vec_load_pair((vec_f32 *)in31, (vec_f32 *)in3); + + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], 
(vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[1], (vec_uc8)inp[1]); +} + +FORCEINLINE void vec_load_mult4_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_bf16 in0[4]; + + vec_load_pair2(in0, in); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[2], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[3], (vec_uc8)inp[3]); +} + +FORCEINLINE void vec_load_mult42_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) +{ + vec_bf16 in01[4], in11[4]; + + vec_load_pair2(in01, in0); + vec_load_pair2(in11, in1); + + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[2], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[2], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[3], (vec_uc8)inp[3]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[3], (vec_uc8)inp[3]); +} + +FORCEINLINE void vec_load_mult44_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 *inp) +{ + vec_bf16 in01[4], in11[4], in21[4], in31[4]; + + vec_load_pair2(in01, in0); + vec_load_pair2(in11, in1); + vec_load_pair2(in21, in2); + vec_load_pair2(in31, in3); + + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[1], (vec_uc8)inp[1]); + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[2], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[2], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[2], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[2], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[3], (vec_uc8)inp[3]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[3], (vec_uc8)inp[3]); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[3], (vec_uc8)inp[3]); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[3], (vec_uc8)inp[3]); +} + FORCEINLINE void vec_loadN_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) { vec_bf16 in0 = vec_loadN(in, n); @@ -57,6 +173,28 @@ FORCEINLINE void vec_loadN_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 i __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0, (vec_uc8)inp); } +FORCEINLINE 
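/* partial-width counterpart of vec_load_mult12a_mma above: loads only n bf16 columns per row before the accumulate */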
void vec_loadN_mult12a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in01 = (vec_bf16)vec_loadN(in0, n); + vec_bf16 in11 = (vec_bf16)vec_loadN(in1, n); + + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); +} + +FORCEINLINE void vec_loadN_mult14_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in01 = (vec_bf16)vec_loadN(in0, n); + vec_bf16 in11 = (vec_bf16)vec_loadN(in1, n); + vec_bf16 in21 = (vec_bf16)vec_loadN(in2, n); + vec_bf16 in31 = (vec_bf16)vec_loadN(in3, n); + + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31, (vec_uc8)inp); +} + FORCEINLINE void vec_mult1_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) { vec_bf16 in00 = vec_mergeh(in0, in0); diff --git a/kernel/power/sbgemv_n_power10.c b/kernel/power/sbgemv_n_power10.c index f33a246a99..b1dcb2fcc4 100644 --- a/kernel/power/sbgemv_n_power10.c +++ b/kernel/power/sbgemv_n_power10.c @@ -119,12 +119,12 @@ static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA if (n > 4) { vec_loadN_mult12_mma(&temp[0], &va0[i], v_x0[ 0], n); - BLASLONG n3 = n & 3; - vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + n &= 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); - vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n); } else if (n) { vec_loadN_mult11_mma(&temp[0], &va0[i], v_x0[ 0], n); @@ -213,12 +213,12 @@ static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA if (n > 4) { vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); - BLASLONG n3 = n & 3; - vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + n &= 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); - vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n); } else if (n) { vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); @@ -318,12 +318,12 @@ static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); - BLASLONG n3 = n & 3; - vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + n &= 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); - vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n); } else if (n) { vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); @@ -445,12 +445,12 @@ static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS vec_loadN_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8], n); vec_loadN_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12], n); - BLASLONG n3 = n & 3; - vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + n &= 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n); vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); - vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n); } else if (n) { vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); vec_loadN_mult11b_mma(&temp[0], &va2[i], 
&va3[i], v_x0[ 4], n); diff --git a/kernel/power/sbgemv_t_power10.c b/kernel/power/sbgemv_t_power10.c index 810287e89a..9a5c54f12f 100644 --- a/kernel/power/sbgemv_t_power10.c +++ b/kernel/power/sbgemv_t_power10.c @@ -49,7 +49,7 @@ static void BF16GEMV_T_MMA_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_bf16 *va0, *v_x; __vector_quad temp0; vec_f32 temp00[4]; - vec_bf16 inp[2]; + vec_bf16 inp[4]; __builtin_mma_xxsetaccz(&temp0); @@ -59,10 +59,18 @@ static void BF16GEMV_T_MMA_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL BLASLONG n8 = n / 8; BLASLONG i = 0; - for (; i + 2 <= n8; i += 2) { + for (; i + 4 <= n8; i += 4) { + vec_load_pair2(inp, &v_x[i]); + + vec_load_mult4_mma(&temp0, &va0[i + 0], inp); + } + + if (n8 & 2) { vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); vec_load_mult2_mma(&temp0, &va0[i + 0], inp); + + i += 2; } if (n8 & 1) { @@ -89,12 +97,12 @@ static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL { IFLOAT *a0, *a1; vec_bf16 *va0, *va1, *v_x; - __vector_quad temp0, temp1; - vec_f32 temp00[4], temp01[4]; - vec_bf16 inp[2]; + __vector_quad temp0[2]; + vec_f32 temp00[4*2]; + vec_bf16 inp[4]; - __builtin_mma_xxsetaccz(&temp0); - __builtin_mma_xxsetaccz(&temp1); + __builtin_mma_xxsetaccz(&temp0[0]); + __builtin_mma_xxsetaccz(&temp0[1]); a0 = ap; a1 = ap + lda; @@ -104,18 +112,24 @@ static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL BLASLONG n8 = n / 8; BLASLONG i = 0; - for (; i + 2 <= n8; i += 2) { + for (; i + 4 <= n8; i += 4) { + vec_load_pair2(inp, &v_x[i]); + + vec_load_mult42_mma(&temp0[0], &va0[i + 0], &va1[i + 0], inp); + } + + if (n8 & 2) { vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); - vec_load_mult2_mma(&temp0, &va0[i + 0], inp); - vec_load_mult2_mma(&temp1, &va1[i + 0], inp); + vec_load_mult22_mma(&temp0[0], &va0[i + 0], &va1[i + 0], inp); + + i += 2; } if (n8 & 1) { inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); - vec_load_mult_mma(&temp0, &va0[i], inp[0]); - vec_load_mult_mma(&temp1, &va1[i], inp[0]); + vec_load_mult12a_mma(&temp0[0], &va0[i], &va1[i], inp[0]); i++; } @@ -124,29 +138,28 @@ static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL if (n) { inp[0] = vec_loadN(&v_x[i], n); - vec_loadN_mult_mma(&temp0, &va0[i], inp[0], n); - vec_loadN_mult_mma(&temp1, &va1[i], inp[0], n); + vec_loadN_mult12a_mma(&temp0[0], &va0[i], &va1[i], inp[0], n); } - __builtin_mma_disassemble_acc((void*)temp00, &temp0); - __builtin_mma_disassemble_acc((void*)temp01, &temp1); + __builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]); + __builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]); y[0] = (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])) + (beta * y[0]); - y[1] = (alpha * (temp01[0][0] + temp01[1][1] + temp01[2][2] + temp01[3][3])) + (beta * y[1]); + y[1] = (alpha * (temp00[4][0] + temp00[5][1] + temp00[6][2] + temp00[7][3])) + (beta * y[1]); } static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) { IFLOAT *a0, *a1, *a2, *a3; vec_bf16 *va0, *va1, *va2, *va3, *v_x; - __vector_quad temp0, temp1, temp2, temp3; - vec_f32 temp00[4], temp01[4], temp02[4], temp03[4]; - vec_bf16 inp[2]; + __vector_quad temp0[4]; + vec_f32 temp00[4*4]; + vec_bf16 inp[4]; - __builtin_mma_xxsetaccz(&temp0); - __builtin_mma_xxsetaccz(&temp1); - __builtin_mma_xxsetaccz(&temp2); - __builtin_mma_xxsetaccz(&temp3); + __builtin_mma_xxsetaccz(&temp0[0]); + __builtin_mma_xxsetaccz(&temp0[1]); + 
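/* one MMA accumulator per output row of the four-row kernel; all four are zeroed before accumulation begins */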
__builtin_mma_xxsetaccz(&temp0[2]); + __builtin_mma_xxsetaccz(&temp0[3]); a0 = ap; a1 = ap + lda; @@ -160,22 +173,24 @@ static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL BLASLONG n8 = n / 8; BLASLONG i = 0; - for (; i + 2 <= n8; i += 2) { + for (; i + 4 <= n8; i += 4) { + vec_load_pair2(inp, &v_x[i]); + + vec_load_mult44_mma(&temp0[0], &va0[i + 0], &va1[i + 0], &va2[i + 0], &va3[i + 0], inp); + } + + if (n8 & 2) { vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); - vec_load_mult2_mma(&temp0, &va0[i + 0], inp); - vec_load_mult2_mma(&temp1, &va1[i + 0], inp); - vec_load_mult2_mma(&temp2, &va2[i + 0], inp); - vec_load_mult2_mma(&temp3, &va3[i + 0], inp); + vec_load_mult24_mma(&temp0[0], &va0[i + 0], &va1[i + 0], &va2[i + 0], &va3[i + 0], inp); + + i += 2; } if (n8 & 1) { inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); - vec_load_mult_mma(&temp0, &va0[i], inp[0]); - vec_load_mult_mma(&temp1, &va1[i], inp[0]); - vec_load_mult_mma(&temp2, &va2[i], inp[0]); - vec_load_mult_mma(&temp3, &va3[i], inp[0]); + vec_load_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0]); i++; } @@ -184,30 +199,27 @@ static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL if (n) { inp[0] = vec_loadN(&v_x[i], n); - vec_loadN_mult_mma(&temp0, &va0[i], inp[0], n); - vec_loadN_mult_mma(&temp1, &va1[i], inp[0], n); - vec_loadN_mult_mma(&temp2, &va2[i], inp[0], n); - vec_loadN_mult_mma(&temp3, &va3[i], inp[0], n); + vec_loadN_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0], n); } - __builtin_mma_disassemble_acc((void*)temp00, &temp0); - __builtin_mma_disassemble_acc((void*)temp01, &temp1); - __builtin_mma_disassemble_acc((void*)temp02, &temp2); - __builtin_mma_disassemble_acc((void*)temp03, &temp3); + __builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]); + __builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]); + __builtin_mma_disassemble_acc((void*)(temp00 + 8), &temp0[2]); + __builtin_mma_disassemble_acc((void*)(temp00 + 12), &temp0[3]); vec_f32 t0, t1, t2, t3, t4, t5, t6, t7; vec_f32 a = { alpha, alpha, alpha, alpha }; vec_f32 b = { beta, beta, beta, beta }; vec_f32 *v_y = (vec_f32 *) y; - t0 = vec_mergeh(temp00[0], temp01[0]); - t1 = vec_mergeh(temp02[0], temp03[0]); - t2 = vec_mergeo(temp00[1], temp01[1]); - t3 = vec_mergeo(temp02[1], temp03[1]); - t4 = vec_mergel(temp00[2], temp01[2]); - t5 = vec_mergel(temp02[2], temp03[2]); - t6 = vec_mergeo(temp00[3], temp01[3]); - t7 = vec_mergeo(temp02[3], temp03[3]); + t0 = vec_mergeh(temp00[ 0], temp00[ 4]); + t1 = vec_mergeh(temp00[ 8], temp00[12]); + t2 = vec_mergeo(temp00[ 1], temp00[ 5]); + t3 = vec_mergeo(temp00[ 9], temp00[13]); + t4 = vec_mergel(temp00[ 2], temp00[ 6]); + t5 = vec_mergel(temp00[10], temp00[14]); + t6 = vec_mergeo(temp00[ 3], temp00[ 7]); + t7 = vec_mergeo(temp00[11], temp00[15]); t0 = vec_xxpermdi(t0, t1, 0); t2 = vec_xxpermdi(t2, t3, 0); t4 = vec_xxpermdi(t4, t5, 0); @@ -223,18 +235,18 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL { IFLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; vec_bf16 *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - __vector_quad temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; - vec_f32 temp00[4], temp01[4], temp02[4], temp03[4], temp04[4], temp05[4], temp06[4], temp07[4]; - vec_bf16 inp[2]; - - __builtin_mma_xxsetaccz(&temp0); - __builtin_mma_xxsetaccz(&temp1); - __builtin_mma_xxsetaccz(&temp2); - __builtin_mma_xxsetaccz(&temp3); - __builtin_mma_xxsetaccz(&temp4); - 
__builtin_mma_xxsetaccz(&temp5); - __builtin_mma_xxsetaccz(&temp6); - __builtin_mma_xxsetaccz(&temp7); + __vector_quad temp0[8]; + vec_f32 temp00[4*8]; + vec_bf16 inp[4]; + + __builtin_mma_xxsetaccz(&temp0[0]); + __builtin_mma_xxsetaccz(&temp0[1]); + __builtin_mma_xxsetaccz(&temp0[2]); + __builtin_mma_xxsetaccz(&temp0[3]); + __builtin_mma_xxsetaccz(&temp0[4]); + __builtin_mma_xxsetaccz(&temp0[5]); + __builtin_mma_xxsetaccz(&temp0[6]); + __builtin_mma_xxsetaccz(&temp0[7]); a0 = ap; a1 = ap + lda; @@ -256,30 +268,27 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL BLASLONG n8 = n / 8; BLASLONG i = 0; - for (; i + 2 <= n8; i += 2) { + for (; i + 4 <= n8; i += 4) { + vec_load_pair2(inp, &v_x[i]); + + vec_load_mult44_mma(&temp0[0], &va0[i + 0], &va1[i + 0], &va2[i + 0], &va3[i + 0], inp); + vec_load_mult44_mma(&temp0[4], &va4[i + 0], &va5[i + 0], &va6[i + 0], &va7[i + 0], inp); + } + + if (n8 & 2) { vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); - vec_load_mult2_mma(&temp0, &va0[i + 0], inp); - vec_load_mult2_mma(&temp1, &va1[i + 0], inp); - vec_load_mult2_mma(&temp2, &va2[i + 0], inp); - vec_load_mult2_mma(&temp3, &va3[i + 0], inp); - vec_load_mult2_mma(&temp4, &va4[i + 0], inp); - vec_load_mult2_mma(&temp5, &va5[i + 0], inp); - vec_load_mult2_mma(&temp6, &va6[i + 0], inp); - vec_load_mult2_mma(&temp7, &va7[i + 0], inp); + vec_load_mult24_mma(&temp0[0], &va0[i + 0], &va1[i + 0], &va2[i + 0], &va3[i + 0], inp); + vec_load_mult24_mma(&temp0[4], &va4[i + 0], &va5[i + 0], &va6[i + 0], &va7[i + 0], inp); + + i += 2; } if (n8 & 1) { inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); - vec_load_mult_mma(&temp0, &va0[i], inp[0]); - vec_load_mult_mma(&temp1, &va1[i], inp[0]); - vec_load_mult_mma(&temp2, &va2[i], inp[0]); - vec_load_mult_mma(&temp3, &va3[i], inp[0]); - vec_load_mult_mma(&temp4, &va4[i], inp[0]); - vec_load_mult_mma(&temp5, &va5[i], inp[0]); - vec_load_mult_mma(&temp6, &va6[i], inp[0]); - vec_load_mult_mma(&temp7, &va7[i], inp[0]); + vec_load_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0]); + vec_load_mult14_mma(&temp0[4], &va4[i], &va5[i], &va6[i], &va7[i], inp[0]); i++; } @@ -288,38 +297,32 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL if (n) { inp[0] = vec_loadN(&v_x[i], n); - vec_loadN_mult_mma(&temp0, &va0[i], inp[0], n); - vec_loadN_mult_mma(&temp1, &va1[i], inp[0], n); - vec_loadN_mult_mma(&temp2, &va2[i], inp[0], n); - vec_loadN_mult_mma(&temp3, &va3[i], inp[0], n); - vec_loadN_mult_mma(&temp4, &va4[i], inp[0], n); - vec_loadN_mult_mma(&temp5, &va5[i], inp[0], n); - vec_loadN_mult_mma(&temp6, &va6[i], inp[0], n); - vec_loadN_mult_mma(&temp7, &va7[i], inp[0], n); + vec_loadN_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0], n); + vec_loadN_mult14_mma(&temp0[4], &va4[i], &va5[i], &va6[i], &va7[i], inp[0], n); } - __builtin_mma_disassemble_acc((void*)temp00, &temp0); - __builtin_mma_disassemble_acc((void*)temp01, &temp1); - __builtin_mma_disassemble_acc((void*)temp02, &temp2); - __builtin_mma_disassemble_acc((void*)temp03, &temp3); - __builtin_mma_disassemble_acc((void*)temp04, &temp4); - __builtin_mma_disassemble_acc((void*)temp05, &temp5); - __builtin_mma_disassemble_acc((void*)temp06, &temp6); - __builtin_mma_disassemble_acc((void*)temp07, &temp7); + __builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]); + __builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]); + __builtin_mma_disassemble_acc((void*)(temp00 + 8), &temp0[2]); + 
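/* each disassemble_acc spills one 4x4 f32 accumulator into four vec_f32 rows of temp00 */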
__builtin_mma_disassemble_acc((void*)(temp00 + 12), &temp0[3]); + __builtin_mma_disassemble_acc((void*)(temp00 + 16), &temp0[4]); + __builtin_mma_disassemble_acc((void*)(temp00 + 20), &temp0[5]); + __builtin_mma_disassemble_acc((void*)(temp00 + 24), &temp0[6]); + __builtin_mma_disassemble_acc((void*)(temp00 + 28), &temp0[7]); vec_f32 t0, t1, t2, t3, t4, t5, t6, t7, t10, t11, t12, t13, t14, t15, t16, t17; vec_f32 a = { alpha, alpha, alpha, alpha }; vec_f32 b = { beta, beta, beta, beta }; vec_f32 *v_y = (vec_f32 *) y; - t0 = vec_mergeh(temp00[0], temp01[0]); - t1 = vec_mergeh(temp02[0], temp03[0]); - t2 = vec_mergeo(temp00[1], temp01[1]); - t3 = vec_mergeo(temp02[1], temp03[1]); - t4 = vec_mergel(temp00[2], temp01[2]); - t5 = vec_mergel(temp02[2], temp03[2]); - t6 = vec_mergeo(temp00[3], temp01[3]); - t7 = vec_mergeo(temp02[3], temp03[3]); + t0 = vec_mergeh(temp00[ 0], temp00[ 4]); + t1 = vec_mergeh(temp00[ 8], temp00[12]); + t2 = vec_mergeo(temp00[ 1], temp00[ 5]); + t3 = vec_mergeo(temp00[ 9], temp00[13]); + t4 = vec_mergel(temp00[ 2], temp00[ 6]); + t5 = vec_mergel(temp00[10], temp00[14]); + t6 = vec_mergeo(temp00[ 3], temp00[ 7]); + t7 = vec_mergeo(temp00[11], temp00[15]); t0 = vec_xxpermdi(t0, t1, 0); t2 = vec_xxpermdi(t2, t3, 0); t4 = vec_xxpermdi(t4, t5, 0); @@ -327,14 +330,14 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL t0 += t2 + t4 + t6; - t10 = vec_mergeh(temp04[0], temp05[0]); - t11 = vec_mergeh(temp06[0], temp07[0]); - t12 = vec_mergeo(temp04[1], temp05[1]); - t13 = vec_mergeo(temp06[1], temp07[1]); - t14 = vec_mergel(temp04[2], temp05[2]); - t15 = vec_mergel(temp06[2], temp07[2]); - t16 = vec_mergeo(temp04[3], temp05[3]); - t17 = vec_mergeo(temp06[3], temp07[3]); + t10 = vec_mergeh(temp00[16], temp00[20]); + t11 = vec_mergeh(temp00[24], temp00[28]); + t12 = vec_mergeo(temp00[17], temp00[21]); + t13 = vec_mergeo(temp00[25], temp00[29]); + t14 = vec_mergel(temp00[18], temp00[22]); + t15 = vec_mergel(temp00[26], temp00[30]); + t16 = vec_mergeo(temp00[19], temp00[23]); + t17 = vec_mergeo(temp00[27], temp00[31]); t10 = vec_xxpermdi(t10, t11, 0); t12 = vec_xxpermdi(t12, t13, 0); t14 = vec_xxpermdi(t14, t15, 0); From eb6f3a05efb1a441c8920a2c4a7fa2e0fe7f6507 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 26 Sep 2024 09:28:56 -0500 Subject: [PATCH 054/244] Common MMA code. 
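
Factor the repeated pairs of __builtin_mma_xvbf16ger2pp calls into small
helpers and let each wider load-multiply routine delegate to the next
narrower one, so every accumulator-update pattern is written exactly once.
A standalone sketch of the two helpers added below; the typedefs are
assumptions standing in for the kernel's own vector types, and building it
needs an MMA-capable compiler (e.g. gcc -mcpu=power10):

```c
#include <altivec.h>

/* stand-ins (assumptions) for the kernel's typedefs and FORCEINLINE macro */
typedef __vector unsigned char vec_uc8;
typedef __vector unsigned short vec_bf16;  /* eight bf16 lanes per 16-byte VSR */

/* accumulate one outer-product step into two adjacent accumulators,
 * one matrix-row vector each, against the same x vector */
static inline void vec_mult2d_mma(__vector_quad *out, vec_bf16 *in01,
                                  vec_bf16 *in11, vec_bf16 *inp)
{
    __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]);
    __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]);
}

/* four accumulators are just two two-accumulator updates */
static inline void vec_mult4d_mma(__vector_quad *out, vec_bf16 *in01, vec_bf16 *in11,
                                  vec_bf16 *in21, vec_bf16 *in31, vec_bf16 *inp)
{
    vec_mult2d_mma(out + 0, in01, in11, inp);
    vec_mult2d_mma(out + 2, in21, in31, inp);
}
```

With these in place, vec_load_mult44_mma collapses from sixteen inline
builtins to four vec_mult4d_mma calls over the loaded vector pairs, and the
loadN variants reuse the narrower loaders the same way.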
--- kernel/power/sbgemv_common_power10.c | 94 ++++++++++++---------------- 1 file changed, 40 insertions(+), 54 deletions(-) diff --git a/kernel/power/sbgemv_common_power10.c b/kernel/power/sbgemv_common_power10.c index 638e2655c0..0510088b23 100644 --- a/kernel/power/sbgemv_common_power10.c +++ b/kernel/power/sbgemv_common_power10.c @@ -48,22 +48,20 @@ FORCEINLINE void vec_load_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 in FORCEINLINE void vec_load_mult12a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp) { - vec_bf16 in01 = (vec_bf16)vec_load_vec(in0); vec_bf16 in11 = (vec_bf16)vec_load_vec(in1); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01, (vec_uc8)inp); + vec_load_mult_mma(out, in0, inp); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); } FORCEINLINE void vec_load_mult14_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 inp) { - vec_bf16 in01 = (vec_bf16)vec_load_vec(in0); - vec_bf16 in11 = (vec_bf16)vec_load_vec(in1); vec_bf16 in21 = (vec_bf16)vec_load_vec(in2); vec_bf16 in31 = (vec_bf16)vec_load_vec(in3); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01, (vec_uc8)inp); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); + vec_load_mult12a_mma(out, in0, in1, inp); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21, (vec_uc8)inp); __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31, (vec_uc8)inp); } @@ -78,6 +76,12 @@ FORCEINLINE void vec_load_mult2_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 * __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[1], (vec_uc8)inp[1]); } +FORCEINLINE void vec_mult2d_mma(__vector_quad *out, vec_bf16 *in01, vec_bf16 *in11, vec_bf16 *inp) +{ + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); +} + FORCEINLINE void vec_load_mult22_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) { vec_bf16 in01[2], in11[2]; @@ -85,10 +89,8 @@ FORCEINLINE void vec_load_mult22_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 vec_load_pair((vec_f32 *)in01, (vec_f32 *)in0); vec_load_pair((vec_f32 *)in11, (vec_f32 *)in1); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[1], (vec_uc8)inp[1]); + vec_mult2d_mma(out, in01 + 0, in11 + 0, inp + 0); + vec_mult2d_mma(out, in01 + 1, in11 + 1, inp + 1); } FORCEINLINE void vec_load_mult24_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 *inp) @@ -100,26 +102,22 @@ FORCEINLINE void vec_load_mult24_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 vec_load_pair((vec_f32 *)in21, (vec_f32 *)in2); vec_load_pair((vec_f32 *)in31, (vec_f32 *)in3); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[1], (vec_uc8)inp[1]); + vec_mult2d_mma(out + 0, in01 + 0, 
in11 + 0, inp + 0); + vec_mult2d_mma(out + 2, in21 + 0, in31 + 0, inp + 0); + vec_mult2d_mma(out + 0, in01 + 1, in11 + 1, inp + 1); + vec_mult2d_mma(out + 2, in21 + 1, in31 + 1, inp + 1); } FORCEINLINE void vec_load_mult4_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) { - vec_bf16 in0[4]; + vec_bf16 in0[2]; - vec_load_pair2(in0, in); + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(in + 2)); - __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[2], (vec_uc8)inp[2]); - __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[3], (vec_uc8)inp[3]); + vec_load_mult2_mma(out, in + 0, inp + 0); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[0], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[1], (vec_uc8)inp[3]); } FORCEINLINE void vec_load_mult42_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) @@ -129,14 +127,16 @@ FORCEINLINE void vec_load_mult42_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 vec_load_pair2(in01, in0); vec_load_pair2(in11, in1); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[2], (vec_uc8)inp[2]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[2], (vec_uc8)inp[2]); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[3], (vec_uc8)inp[3]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[3], (vec_uc8)inp[3]); + vec_mult2d_mma(out, in01 + 0, in11 + 0, inp + 0); + vec_mult2d_mma(out, in01 + 1, in11 + 1, inp + 1); + vec_mult2d_mma(out, in01 + 2, in11 + 2, inp + 2); + vec_mult2d_mma(out, in01 + 3, in11 + 3, inp + 3); +} + +FORCEINLINE void vec_mult4d_mma(__vector_quad *out, vec_bf16 *in01, vec_bf16 *in11, vec_bf16 *in21, vec_bf16 *in31, vec_bf16 *inp) +{ + vec_mult2d_mma(out + 0, in01, in11, inp); + vec_mult2d_mma(out + 2, in21, in31, inp); } FORCEINLINE void vec_load_mult44_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 *inp) @@ -148,22 +148,10 @@ FORCEINLINE void vec_load_mult44_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 vec_load_pair2(in21, in2); vec_load_pair2(in31, in3); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[0], (vec_uc8)inp[0]); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[1], (vec_uc8)inp[1]); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[2], (vec_uc8)inp[2]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[2], (vec_uc8)inp[2]); - __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[2], (vec_uc8)inp[2]); - __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[2], (vec_uc8)inp[2]); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[3], (vec_uc8)inp[3]); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[3], (vec_uc8)inp[3]); - __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21[3], (vec_uc8)inp[3]); - 
__builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31[3], (vec_uc8)inp[3]); + vec_mult4d_mma(out, in01 + 0, in11 + 0, in21 + 0, in31 + 0, inp + 0); + vec_mult4d_mma(out, in01 + 1, in11 + 1, in21 + 1, in31 + 1, inp + 1); + vec_mult4d_mma(out, in01 + 2, in11 + 2, in21 + 2, in31 + 2, inp + 2); + vec_mult4d_mma(out, in01 + 3, in11 + 3, in21 + 3, in31 + 3, inp + 3); } FORCEINLINE void vec_loadN_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) @@ -175,22 +163,20 @@ FORCEINLINE void vec_loadN_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 i FORCEINLINE void vec_loadN_mult12a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp, BLASLONG n) { - vec_bf16 in01 = (vec_bf16)vec_loadN(in0, n); vec_bf16 in11 = (vec_bf16)vec_loadN(in1, n); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01, (vec_uc8)inp); + vec_loadN_mult_mma(out, in0, inp, n); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); } FORCEINLINE void vec_loadN_mult14_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 inp, BLASLONG n) { - vec_bf16 in01 = (vec_bf16)vec_loadN(in0, n); - vec_bf16 in11 = (vec_bf16)vec_loadN(in1, n); vec_bf16 in21 = (vec_bf16)vec_loadN(in2, n); vec_bf16 in31 = (vec_bf16)vec_loadN(in3, n); - __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01, (vec_uc8)inp); - __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); + vec_loadN_mult12a_mma(out, in0, in1, inp, n); + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21, (vec_uc8)inp); __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31, (vec_uc8)inp); } From d7c0d87cd1b961300a1d32a3a7ac74d030ad1faf Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 26 Sep 2024 15:21:29 -0500 Subject: [PATCH 055/244] Small changes. --- kernel/power/sbgemv_common_power10.c | 36 +++++++++++++++++++++++ kernel/power/sbgemv_t_power10.c | 43 +++++++--------------------- kernel/power/sbgemv_t_vsx.c | 9 +++--- 3 files changed, 52 insertions(+), 36 deletions(-) diff --git a/kernel/power/sbgemv_common_power10.c b/kernel/power/sbgemv_common_power10.c index 0510088b23..b0e611cb68 100644 --- a/kernel/power/sbgemv_common_power10.c +++ b/kernel/power/sbgemv_common_power10.c @@ -525,6 +525,42 @@ FORCEINLINE void vec_store4_pair(vec_f32 *v_y, vec_f32 *vy0) vec_store_pair(v_y + 6, vy0 + 6); } +FORCEINLINE void vec_setzero_2(__vector_quad *temp0) +{ + __builtin_mma_xxsetaccz(&temp0[0]); + __builtin_mma_xxsetaccz(&temp0[1]); +} + +FORCEINLINE void vec_setzero_4(__vector_quad *temp0) +{ + vec_setzero_2(temp0 + 0); + vec_setzero_2(temp0 + 2); +} + +FORCEINLINE void vec_setzero_8(__vector_quad *temp0) +{ + vec_setzero_4(temp0 + 0); + vec_setzero_4(temp0 + 4); +} + +FORCEINLINE void vec_reduce_2(vec_f32 *temp00, __vector_quad *temp0) +{ + __builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]); + __builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]); +} + +FORCEINLINE void vec_reduce_4(vec_f32 *temp00, __vector_quad *temp0) +{ + vec_reduce_2(temp00 + 0, temp0 + 0); + vec_reduce_2(temp00 + 8, temp0 + 2); +} + +FORCEINLINE void vec_reduce_8(vec_f32 *temp00, __vector_quad *temp0) +{ + vec_reduce_4(temp00 + 0, temp0 + 0); + vec_reduce_4(temp00 + 16, temp0 + 4); +} + #ifdef USE_MERGE_MMA FORCEINLINE void vec_load8_pair(vec_f32 *vy0, vec_f32 *v_y) { diff --git a/kernel/power/sbgemv_t_power10.c b/kernel/power/sbgemv_t_power10.c index 9a5c54f12f..d2f6087f05 100644 --- a/kernel/power/sbgemv_t_power10.c +++ b/kernel/power/sbgemv_t_power10.c @@ -101,8 +101,7 @@ static void BF16GEMV_T_MMA_2(BLASLONG n, 
BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 temp00[4*2]; vec_bf16 inp[4]; - __builtin_mma_xxsetaccz(&temp0[0]); - __builtin_mma_xxsetaccz(&temp0[1]); + vec_setzero_2(&temp0[0]); a0 = ap; a1 = ap + lda; @@ -141,8 +140,7 @@ static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_loadN_mult12a_mma(&temp0[0], &va0[i], &va1[i], inp[0], n); } - __builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]); - __builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]); + vec_reduce_2(temp00, &temp0[0]); y[0] = (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])) + (beta * y[0]); y[1] = (alpha * (temp00[4][0] + temp00[5][1] + temp00[6][2] + temp00[7][3])) + (beta * y[1]); @@ -156,10 +154,7 @@ static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 temp00[4*4]; vec_bf16 inp[4]; - __builtin_mma_xxsetaccz(&temp0[0]); - __builtin_mma_xxsetaccz(&temp0[1]); - __builtin_mma_xxsetaccz(&temp0[2]); - __builtin_mma_xxsetaccz(&temp0[3]); + vec_setzero_4(&temp0[0]); a0 = ap; a1 = ap + lda; @@ -202,10 +197,7 @@ static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_loadN_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0], n); } - __builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]); - __builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]); - __builtin_mma_disassemble_acc((void*)(temp00 + 8), &temp0[2]); - __builtin_mma_disassemble_acc((void*)(temp00 + 12), &temp0[3]); + vec_reduce_4(temp00, &temp0[0]); vec_f32 t0, t1, t2, t3, t4, t5, t6, t7; vec_f32 a = { alpha, alpha, alpha, alpha }; @@ -239,23 +231,17 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 temp00[4*8]; vec_bf16 inp[4]; - __builtin_mma_xxsetaccz(&temp0[0]); - __builtin_mma_xxsetaccz(&temp0[1]); - __builtin_mma_xxsetaccz(&temp0[2]); - __builtin_mma_xxsetaccz(&temp0[3]); - __builtin_mma_xxsetaccz(&temp0[4]); - __builtin_mma_xxsetaccz(&temp0[5]); - __builtin_mma_xxsetaccz(&temp0[6]); - __builtin_mma_xxsetaccz(&temp0[7]); + vec_setzero_8(&temp0[0]); + BLASLONG lda4 = lda << 2; a0 = ap; a1 = ap + lda; a2 = a1 + lda; a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; + a4 = a0 + lda4; + a5 = a1 + lda4; + a6 = a2 + lda4; + a7 = a3 + lda4; va0 = (vec_bf16 *)a0; va1 = (vec_bf16 *)a1; va2 = (vec_bf16 *)a2; @@ -301,14 +287,7 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_loadN_mult14_mma(&temp0[4], &va4[i], &va5[i], &va6[i], &va7[i], inp[0], n); } - __builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]); - __builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]); - __builtin_mma_disassemble_acc((void*)(temp00 + 8), &temp0[2]); - __builtin_mma_disassemble_acc((void*)(temp00 + 12), &temp0[3]); - __builtin_mma_disassemble_acc((void*)(temp00 + 16), &temp0[4]); - __builtin_mma_disassemble_acc((void*)(temp00 + 20), &temp0[5]); - __builtin_mma_disassemble_acc((void*)(temp00 + 24), &temp0[6]); - __builtin_mma_disassemble_acc((void*)(temp00 + 28), &temp0[7]); + vec_reduce_8(temp00, &temp0[0]); vec_f32 t0, t1, t2, t3, t4, t5, t6, t7, t10, t11, t12, t13, t14, t15, t16, t17; vec_f32 a = { alpha, alpha, alpha, alpha }; diff --git a/kernel/power/sbgemv_t_vsx.c b/kernel/power/sbgemv_t_vsx.c index 399989bb52..0750405031 100644 --- a/kernel/power/sbgemv_t_vsx.c +++ b/kernel/power/sbgemv_t_vsx.c @@ -198,14 +198,15 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_bf16 zero 
= { 0, 0, 0, 0, 0, 0, 0, 0 }; vec_f32 inp[2]; + BLASLONG lda4 = lda << 2; a0 = ap; a1 = ap + lda; a2 = a1 + lda; a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; + a4 = a0 + lda4; + a5 = a1 + lda4; + a6 = a2 + lda4; + a7 = a3 + lda4; va0 = (vec_bf16 *)a0; va1 = (vec_bf16 *)a1; va2 = (vec_bf16 *)a2; From c8788208c8bb135bb9b5b8af2476f296987b7cf5 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 27 Sep 2024 13:27:03 -0500 Subject: [PATCH 056/244] Fixing block issue with transpose version. --- kernel/power/sbgemv_n.c | 4 +--- kernel/power/sbgemv_t.c | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c index 4768be31fa..eab0b4e33b 100644 --- a/kernel/power/sbgemv_n.c +++ b/kernel/power/sbgemv_n.c @@ -202,10 +202,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * a += NB; if (inc_y != 1) { move_y(NB, ybuffer, y_ptr, inc_y); - y_ptr += (NB * inc_y); - } else { - y_ptr += NB; } + y_ptr += (NB * inc_y); } return 0; diff --git a/kernel/power/sbgemv_t.c b/kernel/power/sbgemv_t.c index 4cc8f060e9..c6fdb6b1ae 100644 --- a/kernel/power/sbgemv_t.c +++ b/kernel/power/sbgemv_t.c @@ -124,6 +124,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * a += NB; x += NB * inc_x; + beta = (FLOAT)1; } return 0; From 48698b2b1d575cd4e10b5667e9dc5bd2fca1cbf2 Mon Sep 17 00:00:00 2001 From: gxw Date: Wed, 18 Sep 2024 17:20:43 +0800 Subject: [PATCH 057/244] LoongArch64: Rename core Use microarchitecture name instead of meaningless strings to name the core, the legacy core is still retained. 1. Rename LOONGSONGENERIC to LA64_GENERIC 2. Rename LOONGSON3R5 to LA464 3. Rename LOONGSON2K1000 to LA264 --- .github/workflows/loongarch64.yml | 9 + .github/workflows/loongarch64_clang.yml | 6 + Makefile.system | 2 +- TargetList.txt | 10 +- cpuid_loongarch64.c | 406 +++++++++++++++--- driver/others/blas_server.c | 2 +- driver/others/dynamic_loongarch64.c | 104 ++++- driver/others/parameter.c | 2 +- getarch.c | 79 +++- interface/gemm.c | 2 +- .../{KERNEL.LOONGSON2K1000 => KERNEL.LA264} | 0 .../{KERNEL.LOONGSON3R5 => KERNEL.LA464} | 0 kernel/setparam-ref.c | 2 +- param.h | 6 +- 14 files changed, 506 insertions(+), 124 deletions(-) rename kernel/loongarch64/{KERNEL.LOONGSON2K1000 => KERNEL.LA264} (100%) rename kernel/loongarch64/{KERNEL.LOONGSON3R5 => KERNEL.LA464} (100%) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index da7f6c9a0c..69379e0500 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -23,6 +23,15 @@ jobs: - target: LOONGSON2K1000 triple: loongarch64-unknown-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 + - target: LA64_GENERIC + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC + - target: LA464 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 + - target: LA264 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 - target: DYNAMIC_ARCH triple: loongarch64-unknown-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC diff --git a/.github/workflows/loongarch64_clang.yml b/.github/workflows/loongarch64_clang.yml index d08e56f627..f1a75ad343 100644 --- a/.github/workflows/loongarch64_clang.yml +++ b/.github/workflows/loongarch64_clang.yml @@ -20,6 +20,12 @@ jobs: opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 - target: LOONGSON2K1000 opts: 
NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 + - target: LA64_GENERIC + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC + - target: LA464 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 + - target: LA264 + opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 - target: DYNAMIC_ARCH opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC diff --git a/Makefile.system b/Makefile.system index c40c1f2340..908e65dab0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -727,7 +727,7 @@ endif endif ifeq ($(ARCH), loongarch64) -DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC +DYNAMIC_CORE = LA64_GENERIC LA264 LA464 endif ifeq ($(ARCH), riscv64) diff --git a/TargetList.txt b/TargetList.txt index 1531fd0d2f..25eeddfb00 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -126,9 +126,17 @@ x280 RISCV64_ZVL256B 11.LOONGARCH64: +// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 are legacy names, +// and it is recommended to use the more standardized naming conventions +// LA64_GENERIC/LA264/LA464. You can still specify TARGET as +// LOONGSONGENERIC/LOONGSON2K1000/LOONGSON3R5 during compilation or runtime, +// and they will be internally relocated to LA64_GENERIC/LA264/LA464. LOONGSONGENERIC -LOONGSON3R5 LOONGSON2K1000 +LOONGSON3R5 +LA64_GENERIC +LA264 +LA464 12. Elbrus E2000: E2K diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index 3b7a9c82ea..c6ce2bb731 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011-2020, The OpenBLAS Project +Copyright (c) 2011-2024, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without @@ -32,53 +32,299 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include -#include #include +#include +#include +#include -/* If LASX extension instructions supported, - * using core LOONGSON3R5 - * If only LSX extension instructions supported, - * using core LOONGSON2K1000 - * If neither LASX nor LSX extension instructions supported, - * using core LOONGSONGENERIC (As far as I know, there is no such - * CPU yet) - */ +#define CPU_LA64_GENERIC 0 +#define CPU_LA264 1 +#define CPU_LA364 2 +#define CPU_LA464 3 +#define CPU_LA664 4 -#define CPU_GENERIC 0 -#define CPU_LOONGSON3R5 1 -#define CPU_LOONGSON2K1000 2 +#define CORE_LA64_GENERIC 0 +#define CORE_LA264 1 +#define CORE_LA464 2 #define LA_HWCAP_LSX (1U << 4) #define LA_HWCAP_LASX (1U << 5) +#define LOONGARCH_CFG0 0x00 +#define LOONGARCH_CFG2 0x02 +#define LOONGARCH_CFG10 0x10 +#define LOONGARCH_CFG11 0x11 +#define LOONGARCH_CFG12 0x12 +#define LOONGARCH_CFG13 0x13 +#define LOONGARCH_CFG14 0x14 +#define LASX_MASK 1<<7 +#define LSX_MASK 1<<6 +#define PRID_SERIES_MASK 0xf000 +#define PRID_SERIES_LA264 0xa000 +#define PRID_SERIES_LA364 0xb000 +#define PRID_SERIES_LA464 0xc000 +#define PRID_SERIES_LA664 0xd000 + +#define CACHE_INFO_L1_IU 0 +#define CACHE_INFO_L1_D 1 +#define CACHE_INFO_L2_IU 2 +#define CACHE_INFO_L2_D 3 +#define CACHE_INFO_L3_IU 4 +#define CACHE_INFO_L3_D 5 +#define L1_IU_PRESENT_MASK 0x0001 +#define L1_IU_UNITY_MASK 0x0002 +#define L1_D_PRESENT_MASK 0x0004 +#define L2_IU_PRESENT_MASK 0x0008 +#define L2_IU_UNITY_MASK 0x0010 +#define L2_D_PRESENT_MASK 0x0080 +#define L3_IU_PRESENT_MASK 0x0400 +#define L3_IU_UNITY_MASK 0x0800 +#define L3_D_PRESENT_MASK 0x4000 +#define CACHE_WAY_MINUS_1_MASK 0x0000ffff +#define CACHE_INDEX_LOG2_MASK 0x00ff0000 +#define CACHE_LINESIZE_LOG2_MASK 0x7f000000 + +typedef struct { + int size; + int associative; + int linesize; + int unify; + int present; +} cache_info_t; + +/* Using microarchitecture representation */ static char *cpuname[] = { - "LOONGSONGENERIC", - "LOONGSON3R5", - "LOONGSON2K1000" + "LA64_GENERIC", + "LA264", /* Loongson 64bit, 2-issue, Like 2K1000LA */ + "LA364", /* Loongson 64bit, 3-issue, Like 2K2000 */ + "LA464", /* Loongson 64bit, 4-issue, Like 3A5000, 3C5000L, 3C5000 and 3D5000 */ + "LA664" /* Loongson 64bit, 6-issue, Like 3A6000, 3C6000 and 3D6000 */ }; static char *cpuname_lower[] = { - "loongsongeneric", - "loongson3r5", - "loongson2k1000" + "la64_generic", + "la264", + "la364", + "la464", + "la664" +}; + +static char *corename[] = { + "LA64_GENERIC", /* Implies using scalar instructions for optimization */ + "LA264", /* Implies using LSX instructions for optimization */ + "LA464", /* Implies using LASX instructions for optimization */ +}; + +static char *corename_lower[] = { + "la64_generic", + "la264", + "la464", }; -int detect(void) { -#ifdef __linux +/* + * Obtain cache and processor identification + * through the cpucfg command. 
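+ * + * CPUCFG word 0x10 carries the cache-presence bits decoded by the masks above; words 0x11-0x14 hold (ways - 1), log2(sets) and log2(line size) for the corresponding cache, so each size is computed as ways * line size * sets.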
+ */ +static void get_cacheinfo(int type, cache_info_t *cacheinfo) { + cache_info_t cache_info; + memset(&cache_info, 0, sizeof(cache_info)); + uint32_t reg_10 = 0; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_10) + : "r"(LOONGARCH_CFG10) + ); + + switch (type) { + case CACHE_INFO_L1_IU: + if (reg_10 & L1_IU_PRESENT_MASK) { + uint32_t reg_11 = 0; + cache_info.present = reg_10 & L1_IU_PRESENT_MASK; + cache_info.unify = reg_10 & L1_IU_UNITY_MASK; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_11) + : "r"(LOONGARCH_CFG11) + ); + cache_info.associative = (reg_11 & CACHE_WAY_MINUS_1_MASK) + 1; + cache_info.linesize = 1 << ((reg_11 & CACHE_LINESIZE_LOG2_MASK) >> 24); + cache_info.size = cache_info.associative * cache_info.linesize * + (1 << ((reg_11 & CACHE_INDEX_LOG2_MASK) >> 16)); + } + break; + + case CACHE_INFO_L1_D: + if (reg_10 & L1_D_PRESENT_MASK) { + uint32_t reg_12 = 0; + cache_info.present = reg_10 & L1_D_PRESENT_MASK; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_12) + : "r"(LOONGARCH_CFG12) + ); + cache_info.associative = (reg_12 & CACHE_WAY_MINUS_1_MASK) + 1; + cache_info.linesize = 1 << ((reg_12 & CACHE_LINESIZE_LOG2_MASK) >> 24); + cache_info.size = cache_info.associative * cache_info.linesize * + (1 << ((reg_12 & CACHE_INDEX_LOG2_MASK) >> 16)); + } + break; + + case CACHE_INFO_L2_IU: + if (reg_10 & L2_IU_PRESENT_MASK) { + uint32_t reg_13 = 0; + cache_info.present = reg_10 & L2_IU_PRESENT_MASK; + cache_info.unify = reg_10 & L2_IU_UNITY_MASK; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_13) + : "r"(LOONGARCH_CFG13) + ); + cache_info.associative = (reg_13 & CACHE_WAY_MINUS_1_MASK) + 1; + cache_info.linesize = 1 << ((reg_13 & CACHE_LINESIZE_LOG2_MASK) >> 24); + cache_info.size = cache_info.associative * cache_info.linesize * + (1 << ((reg_13 & CACHE_INDEX_LOG2_MASK) >> 16)); + } + break; + + case CACHE_INFO_L2_D: + if (reg_10 & L2_D_PRESENT_MASK) { + cache_info.present = reg_10 & L2_D_PRESENT_MASK; + // No date fetch + } + break; + + case CACHE_INFO_L3_IU: + if (reg_10 & L3_IU_PRESENT_MASK) { + uint32_t reg_14 = 0; + cache_info.present = reg_10 & L3_IU_PRESENT_MASK; + cache_info.unify = reg_10 & L3_IU_UNITY_MASK; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg_14) + : "r"(LOONGARCH_CFG14) + ); + cache_info.associative = (reg_14 & CACHE_WAY_MINUS_1_MASK) + 1; + cache_info.linesize = 1 << ((reg_14 & CACHE_LINESIZE_LOG2_MASK) >> 24); + cache_info.size = cache_info.associative * cache_info.linesize * + (1 << ((reg_14 & CACHE_INDEX_LOG2_MASK) >> 16)); + } + break; + + case CACHE_INFO_L3_D: + if (reg_10 & L3_D_PRESENT_MASK) { + cache_info.present = reg_10 & L3_D_PRESENT_MASK; + // No data fetch + } + break; + + default: + break; + } + *cacheinfo = cache_info; +} + +static uint32_t get_prid() { + uint32_t reg = 0; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG0) + ); + return reg; +} + +static void get_cpucount(uint32_t *count) { + uint32_t num = 0; + FILE *f = fopen("/proc/cpuinfo", "r"); + if (!f) return; + char buf[200]; + while (fgets(buf, sizeof(buf), f)) + { + if (!strncmp("processor", buf, 9)) + num ++; + } + fclose(f); + *count = num; +} + +/* Detect whether the OS supports the LASX instruction set */ +static int os_support_lasx() { int hwcap = (int)getauxval(AT_HWCAP); if (hwcap & LA_HWCAP_LASX) - return CPU_LOONGSON3R5; - else if (hwcap & LA_HWCAP_LSX) - return CPU_LOONGSON2K1000; + return 1; + else + return 0; +} + +/* Detect whether the OS supports the LSX instruction set */ +static int 
os_support_lsx() { + int hwcap = (int)getauxval(AT_HWCAP); + + if (hwcap & LA_HWCAP_LSX) + return 1; else - return CPU_GENERIC; -#endif - return CPU_GENERIC; + return 0; +} + +int get_coretype(void) { + uint32_t prid = get_prid(); + switch (prid & PRID_SERIES_MASK) { + case (PRID_SERIES_LA464): + case (PRID_SERIES_LA664): + if (os_support_lasx()) + return CORE_LA464; + else if (os_support_lsx()) + return CORE_LA264; + else + return CORE_LA64_GENERIC; + break; + + case (PRID_SERIES_LA264): + case (PRID_SERIES_LA364): + if (os_support_lsx()) + return CORE_LA264; + else + return CORE_LA64_GENERIC; + break; + + default: + return CORE_LA64_GENERIC; + break; + } +} + +int get_cputype(void) { + uint32_t prid = get_prid(); + switch (prid & PRID_SERIES_MASK) { + case (PRID_SERIES_LA264): + return CPU_LA264; + break; + + case (PRID_SERIES_LA364): + return CPU_LA364; + break; + + case (PRID_SERIES_LA464): + return CPU_LA464; + break; + + case (PRID_SERIES_LA664): + return CPU_LA664; + break; + + default: + return CPU_LA64_GENERIC; + break; + } } char *get_corename(void) { - return cpuname[detect()]; + return corename[get_coretype()]; +} + +void get_libname(void){ + printf("%s", corename_lower[get_coretype()]); } void get_architecture(void) { @@ -86,8 +332,7 @@ void get_architecture(void) { } void get_subarchitecture(void) { - int d = detect(); - printf("%s", cpuname[d]); + printf("%s", cpuname[get_cputype()]); } void get_subdirname(void) { @@ -95,50 +340,69 @@ void get_subdirname(void) { } void get_cpuconfig(void) { - uint32_t hwcaps = 0; - int d = detect(); - - switch (d) { - case CPU_LOONGSON3R5: - printf("#define LOONGSON3R5\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 1048576\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; + cache_info_t info; + uint32_t num_cores = 0; - case CPU_LOONGSON2K1000: - printf("#define LOONGSON2K1000\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; + printf("#define %s\n", corename[get_coretype()]); // Core name - default: - printf("#define LOONGSONGENERIC\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L2_SIZE 262144\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); - printf("#define L2_ASSOCIATIVE 16\n"); - break; + printf("#define CPU_NAME %s\n", cpuname[get_cputype()]); // Cpu microarchitecture name + + get_cacheinfo(CACHE_INFO_L1_IU, &info); + if (info.present) { + if (info.unify) { // Unified cache, without distinguishing between instructions and data + printf("#define L1_SIZE %d\n", info.size); + printf("#define L1_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_LINESIZE %d\n", info.linesize); + } else { + printf("#define L1_CODE_SIZE %d\n", info.size); + printf("#define L1_CODE_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_CODE_LINESIZE %d\n", info.linesize); + } } - hwcaps = (uint32_t)getauxval( AT_HWCAP ); - if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n"); - if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n"); -} + if (!info.unify) { + 
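/* with a split L1, the data side is reported separately (CPUCFG word 0x12) */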
get_cacheinfo(CACHE_INFO_L1_D, &info); + if (info.present) { + printf("#define L1_DATA_SIZE %d\n", info.size); + printf("#define L1_DATA_ASSOCIATIVE %d\n", info.associative); + printf("#define L1_DATA_LINESIZE %d\n", info.linesize); + } + } -void get_libname(void){ - int d = detect(); - printf("%s", cpuname_lower[d]); + get_cacheinfo(CACHE_INFO_L2_IU, &info); + if (info.present > 0) { + if (info.unify) { + printf("#define L2_SIZE %d\n", info.size); + printf("#define L2_ASSOCIATIVE %d\n", info.associative); + printf("#define L2_LINESIZE %d\n", info.linesize); + } else { + printf("#define L2_CODE_SIZE %d\n", info.size); + printf("#define L2_CODE_ASSOCIATIVE %d\n", info.associative); + printf("#define L2_CODE_LINESIZE %d\n", info.linesize); + } + } + + get_cacheinfo(CACHE_INFO_L3_IU, &info); + if (info.present > 0) { + if (info.unify) { + printf("#define L3_SIZE %d\n", info.size); + printf("#define L3_ASSOCIATIVE %d\n", info.associative); + printf("#define L3_LINESIZE %d\n", info.linesize); + } else { + printf("#define L3_CODE_SIZE %d\n", info.size); + printf("#define L3_CODE_ASSOCIATIVE %d\n", info.associative); + printf("#define L3_CODE_LINESIZE %d\n", info.linesize); + } + } + + if(os_support_lsx) printf("#define HAVE_LSX\n"); + if(os_support_lasx) printf("#define HAVE_LASX\n"); + + get_cpucount(&num_cores); + if (num_cores) + printf("#define NUM_CORES %d\n", num_cores); + + //TODO: It’s unclear what this entry represents, but it is indeed necessary. + //It has been set based on reference to other platforms. + printf("#define DTB_DEFAULT_ENTRIES 64\n"); } diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 29f8a5e646..7306a3ecd8 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -1082,7 +1082,7 @@ if (buffer == NULL) { } -//For target LOONGSON3R5, applying an offset to the buffer is essential +//For LOONGARCH64, applying an offset to the buffer is essential //for minimizing cache conflicts and optimizing performance. #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) if (sa == NULL) sa = (void *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c index 44de596698..51196c6b87 100644 --- a/driver/others/dynamic_loongarch64.c +++ b/driver/others/dynamic_loongarch64.c @@ -28,25 +28,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "common.h" -extern gotoblas_t gotoblas_LOONGSON3R5; -extern gotoblas_t gotoblas_LOONGSON2K1000; -extern gotoblas_t gotoblas_LOONGSONGENERIC; +#define NUM_CORETYPES 6 +#define LOONGARCH_CFG0 0x00 +#define LA_HWCAP_LSX (1U << 4) +#define LA_HWCAP_LASX (1U << 5) +#define PRID_SERIES_MASK 0xf000 +#define PRID_SERIES_LA264 0xa000 +#define PRID_SERIES_LA364 0xb000 +#define PRID_SERIES_LA464 0xc000 +#define PRID_SERIES_LA664 0xd000 + +extern gotoblas_t gotoblas_LA64_GENERIC; +extern gotoblas_t gotoblas_LA264; +extern gotoblas_t gotoblas_LA464; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 3 - static char *corename[] = { - "loongson3r5", - "loongson2k1000", + "la64_generic", + "la264", + "la464", "loongsongeneric", + "loongson2k1000", + "loongson3r5", "unknown" }; char *gotoblas_corename(void) { - if (gotoblas == &gotoblas_LOONGSON3R5) return corename[0]; - if (gotoblas == &gotoblas_LOONGSON2K1000) return corename[1]; - if (gotoblas == &gotoblas_LOONGSONGENERIC) return corename[2]; + if (gotoblas == &gotoblas_LA64_GENERIC) return corename[0]; + if (gotoblas == &gotoblas_LA264) return corename[1]; + if (gotoblas == &gotoblas_LA464) return corename[2]; return corename[NUM_CORETYPES]; } @@ -66,27 +77,78 @@ static gotoblas_t *force_coretype(char *coretype) { switch (found) { - case 0: return (&gotoblas_LOONGSON3R5); - case 1: return (&gotoblas_LOONGSON2K1000); - case 2: return (&gotoblas_LOONGSONGENERIC); + case 0: return (&gotoblas_LA64_GENERIC); + case 1: return (&gotoblas_LA264); + case 2: return (&gotoblas_LA464); + case 3: return (&gotoblas_LA64_GENERIC); + case 4: return (&gotoblas_LA264); + case 5: return (&gotoblas_LA464); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); return NULL; } -#define LA_HWCAP_LSX (1U << 4) -#define LA_HWCAP_LASX (1U << 5) -static gotoblas_t *get_coretype(void) { - int hwcap = (int)getauxval(AT_HWCAP); +/* Detect whether the OS supports the LASX instruction set */ +static int os_support_lasx() { + int hwcap = (int)getauxval(AT_HWCAP); if (hwcap & LA_HWCAP_LASX) - return &gotoblas_LOONGSON3R5; - else if (hwcap & LA_HWCAP_LSX) - return &gotoblas_LOONGSON2K1000; + return 1; + else + return 0; +} + +/* Detect whether the OS supports the LSX instruction set */ +static int os_support_lsx() { + int hwcap = (int)getauxval(AT_HWCAP); + + if (hwcap & LA_HWCAP_LSX) + return 1; else - return &gotoblas_LOONGSONGENERIC; + return 0; +} + +static uint32_t get_prid() { + uint32_t reg = 0; + __asm__ volatile ( + "cpucfg %0, %1 \n\t" + : "+&r"(reg) + : "r"(LOONGARCH_CFG0) + ); + return reg; +} + +/* Select core at runtime based on the + * cpu name and SIMD instructions supported + * by the system + */ +static gotoblas_t *get_coretype(void) { + uint32_t prid = get_prid(); + switch (prid & PRID_SERIES_MASK) { + case (PRID_SERIES_LA464): + case (PRID_SERIES_LA664): + if (os_support_lasx()) + return &gotoblas_LA464; + else if (os_support_lsx()) + return &gotoblas_LA264; + else + return &gotoblas_LA64_GENERIC; + break; + + case (PRID_SERIES_LA264): + case (PRID_SERIES_LA364): + if (os_support_lsx()) + return &gotoblas_LA264; + else + return &gotoblas_LA64_GENERIC; + break; + + default: + return &gotoblas_LA64_GENERIC; + break; + } } void gotoblas_dynamic_init(void) { diff --git a/driver/others/parameter.c b/driver/others/parameter.c index a208a1a9d7..597e5cac7e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -752,7 +752,7 @@ int get_L3_size() { } void 
blas_set_parameter(void){ -#if defined(LOONGSON3R5) +#if defined(LA464) int L3_size = get_L3_size(); #ifdef SMP if(blas_num_threads == 1){ diff --git a/getarch.c b/getarch.c index 842a843fad..826dd1ce0a 100644 --- a/getarch.c +++ b/getarch.c @@ -135,11 +135,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_CELL */ /* #define FORCE_MIPS64_GENERIC */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3R3 */ -/* #define FORCE_LOONGSON3R4 */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_LOONGSON3R5 */ /* #define FORCE_LOONGSON2K1000 */ /* #define FORCE_LOONGSONGENERIC */ +/* #define FORCE_LA64_GENERIC */ +/* #define FORCE_LA264 */ +/* #define FORCE_LA464 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -153,7 +156,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_EV5 */ /* #define FORCE_EV6 */ /* #define FORCE_CSKY */ -/* #define FORCE_CK860FV */ +/* #define FORCE_CK860FV */ /* #define FORCE_GENERIC */ #ifdef FORCE_P2 @@ -979,46 +982,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif -#ifdef FORCE_LOONGSON3R5 +#if defined(FORCE_LA464) || defined(FORCE_LOONGSON3R5) #define FORCE #define ARCHITECTURE "LOONGARCH" -#define SUBARCHITECTURE "LOONGSON3R5" +#ifdef NO_LASX +#ifdef NO_LSX +#define SUBARCHITECTURE "LA64_GENERIC" #define SUBDIRNAME "loongarch64" -#define ARCHCONFIG "-DLOONGSON3R5 " \ +#define ARCHCONFIG "-DLA64_GENERIC " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ - "-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" -#define LIBNAME "loongson3r5" -#define CORENAME "LOONGSON3R5" + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la64_generic" +#define CORENAME "LA64_GENERIC" #else +#define SUBARCHITECTURE "LA264" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLA264 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la264" +#define CORENAME "LA264" +#endif +#else +#define SUBARCHITECTURE "LA464" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLA464 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la464" +#define CORENAME "LA464" +#endif #endif -#ifdef FORCE_LOONGSON2K1000 +#if defined(FORCE_LA264) || defined(FORCE_LOONGSON2K1000) #define FORCE #define ARCHITECTURE "LOONGARCH" -#define SUBARCHITECTURE "LOONGSON2K1000" +#ifdef NO_LSX +#define SUBARCHITECTURE "LA64_GENERIC" #define SUBDIRNAME "loongarch64" -#define ARCHCONFIG "-DLOONGSON2K1000 " \ +#define ARCHCONFIG "-DLA64_GENERIC " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" -#define LIBNAME "loongson2k1000" -#define CORENAME "LOONGSON2K1000" + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la64_generic" +#define CORENAME "LA64_GENERIC" #else +#define SUBARCHITECTURE "LA264" +#define SUBDIRNAME "loongarch64" +#define ARCHCONFIG "-DLA264 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la264" +#define CORENAME "LA264" +#endif #endif -#ifdef FORCE_LOONGSONGENERIC +#if defined(FORCE_LA64_GENERIC) || 
defined(FORCE_LOONGSONGENERIC) #define FORCE #define ARCHITECTURE "LOONGARCH" -#define SUBARCHITECTURE "LOONGSONGENERIC" +#define SUBARCHITECTURE "LA64_GENERIC" #define SUBDIRNAME "loongarch64" -#define ARCHCONFIG "-DLOONGSONGENERIC " \ +#define ARCHCONFIG "-DLA64_GENERIC " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA" -#define LIBNAME "loongsongeneric" -#define CORENAME "LOONGSONGENERIC" -#else + "-DDTB_DEFAULT_ENTRIES=64 " +#define LIBNAME "la64_generic" +#define CORENAME "LA64_GENERIC" #endif #ifdef FORCE_I6400 diff --git a/interface/gemm.c b/interface/gemm.c index 64b8b620cf..c030947b6f 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -572,7 +572,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS buffer = (XFLOAT *)blas_memory_alloc(0); -//For target LOONGSON3R5, applying an offset to the buffer is essential +//For LOONGARCH64, applying an offset to the buffer is essential //for minimizing cache conflicts and optimizing performance. #if defined(ARCH_LOONGARCH64) && !defined(NO_AFFINITY) sa = (XFLOAT *)((BLASLONG)buffer + (WhereAmI() & 0xf) * GEMM_OFFSET_A); diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LA264 similarity index 100% rename from kernel/loongarch64/KERNEL.LOONGSON2K1000 rename to kernel/loongarch64/KERNEL.LA264 diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LA464 similarity index 100% rename from kernel/loongarch64/KERNEL.LOONGSON3R5 rename to kernel/loongarch64/KERNEL.LA464 diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 9d494bfc62..fa61a209e1 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1086,7 +1086,7 @@ static void init_parameter(void) { TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif -#if defined(LOONGSON3R5) +#if defined(LA464) int L3_size = get_L3_size(); #ifdef SMP if(blas_num_threads == 1){ diff --git a/param.h b/param.h index 0e4d8965d9..66eedc7980 100644 --- a/param.h +++ b/param.h @@ -2838,7 +2838,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#if defined (LOONGSON3R5) +#if defined (LA464) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2891,7 +2891,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON2K1000 +#ifdef LA264 #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL @@ -2926,7 +2926,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif -#ifdef LOONGSONGENERIC +#ifdef LA64_GENERIC #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL From 30af9278dcea8faf49b40cecd821a5a34bcab9eb Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 26 Sep 2024 16:55:06 +0800 Subject: [PATCH 058/244] LoongArch64: Enable cmake cross-compilation --- cmake/arch.cmake | 4 ++ cmake/prebuild.cmake | 48 +++++++++++++++++++++++ cmake/system.cmake | 4 +- cmake/system_check.cmake | 2 + driver/others/CMakeLists.txt | 2 + kernel/loongarch64/KERNEL.LOONGSONGENERIC | 6 +++ 6 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 kernel/loongarch64/KERNEL.LOONGSONGENERIC diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5f3703ae0b..0ff4f1df31 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -94,6 +94,10 @@ if (DYNAMIC_ARCH) endif () endif () + if (LOONGARCH64) + set(DYNAMIC_CORE LOONGSONGENERIC LOONGSON2K1000 LOONGSON3R5) + endif () + if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 609fbe2417..785c275c78 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1349,6 +1349,54 @@ endif () "#define DTB_DEFAULT_ENTRIES 128\n" "#define DTB_SIZE 4096\n" "#define L2_ASSOCIATIVE 4\n") + elseif ("${TCORE}" STREQUAL "LOONGSONGENERIC") + file(APPEND ${TARGET_CONF_TEMP} + "#define DTB_DEFAULT_ENTRIES 64\n") + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 2) + set(DGEMM_UNROLL_N 8) + set(CGEMM_UNROLL_M 1) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 1) + set(ZGEMM_UNROLL_N 4) + set(CGEMM3M_UNROLL_M 2) + set(CGEMM3M_UNROLL_N 8) + set(ZGEMM3M_UNROLL_M 2) + set(ZGEMM3M_UNROLL_N 8) + elseif ("${TCORE}" STREQUAL "LOONGSON2K1000") + file(APPEND ${TARGET_CONF_TEMP} + "#define DTB_DEFAULT_ENTRIES 64\n") + set(HAVE_LSX 1) + set(SGEMM_UNROLL_M 2) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(CGEMM3M_UNROLL_M 2) + set(CGEMM3M_UNROLL_N 8) + set(ZGEMM3M_UNROLL_M 8) + set(ZGEMM3M_UNROLL_N 4) + elseif ("${TCORE}" STREQUAL "LOONGSON3R5") + file(APPEND ${TARGET_CONF_TEMP} + "#define DTB_DEFAULT_ENTRIES 64\n") + set(HAVE_LASX 1) + set(HAVE_LSX 1) + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 16) + set(DGEMM_UNROLL_N 6) + set(CGEMM_UNROLL_M 16) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 4) + set(CGEMM3M_UNROLL_M 16) + set(CGEMM3M_UNROLL_N 8) + set(ZGEMM3M_UNROLL_M 16) + set(ZGEMM3M_UNROLL_N 6) endif() set(SBGEMM_UNROLL_M 8) set(SBGEMM_UNROLL_N 4) diff --git a/cmake/system.cmake b/cmake/system.cmake index a0b73ddae0..d697d69405 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -388,7 +388,7 @@ if (NEED_PIC) endif() endif () -if (X86_64 OR ${CORE} STREQUAL POWER10) +if (X86_64 OR ${CORE} STREQUAL POWER10 OR LOONGARCH64) set(SMALL_MATRIX_OPT TRUE) endif () if (ARM64) @@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT) endif () if (DYNAMIC_ARCH) - if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64) + if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64 OR LOONGARCH64) set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") if (DYNAMIC_OLDER) set(CCOMMON_OPT "${CCOMMON_OPT} 
-DDYNAMIC_OLDER") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index e94497a048..59a1358789 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -104,6 +104,8 @@ elseif(ARM) set(ARCH "arm") elseif(ARM64) set(ARCH "arm64") +elseif(LOONGARCH64) + set(ARCH "loongarch64") else() set(ARCH ${CMAKE_SYSTEM_PROCESSOR} CACHE STRING "Target Architecture") endif () diff --git a/driver/others/CMakeLists.txt b/driver/others/CMakeLists.txt index 659449fbc2..139f329ecf 100644 --- a/driver/others/CMakeLists.txt +++ b/driver/others/CMakeLists.txt @@ -54,6 +54,8 @@ if (DYNAMIC_ARCH) list(APPEND COMMON_SOURCES dynamic_power.c) elseif (RISCV64) list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c) + elseif (LOONGARCH64) + list(APPEND COMMON_SOURCES dynamic_loongarch64.c) else () list(APPEND COMMON_SOURCES dynamic.c) endif () diff --git a/kernel/loongarch64/KERNEL.LOONGSONGENERIC b/kernel/loongarch64/KERNEL.LOONGSONGENERIC new file mode 100644 index 0000000000..fc4c12008d --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSONGENERIC @@ -0,0 +1,6 @@ +include $(KERNELDIR)/KERNEL + +STRMMKERNEL = gemm_kernel.S +DTRMMKERNEL = gemm_kernel.S +CTRMMKERNEL = zgemm_kernel.S +ZTRMMKERNEL = zgemm_kernel.S From 7087b0a7d0038fb31fff48d65d0d2ba6ab3397ce Mon Sep 17 00:00:00 2001 From: gxw Date: Sun, 29 Sep 2024 10:31:26 +0800 Subject: [PATCH 059/244] ARM64: Enable SMALL_MATRIX_OPT when compiling with CMake --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index a0b73ddae0..7a11d27e21 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -388,7 +388,7 @@ if (NEED_PIC) endif() endif () -if (X86_64 OR ${CORE} STREQUAL POWER10) +if (X86_64 OR ${CORE} STREQUAL POWER10 OR ARM64) set(SMALL_MATRIX_OPT TRUE) endif () if (ARM64) From 0228d362112fd586b7dbe37f4fec54fe73f69668 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Sep 2024 21:38:05 +0200 Subject: [PATCH 060/244] move -fopenmp to CFLAGS --- cmake/openblas.pc.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index 9526d2df6e..374221b47f 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -9,5 +9,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OpenBLAS_VERSION@ URL: https://github.com/OpenMathLib/OpenBLAS -Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} -Cflags: -I${includedir} +Libs: -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} +Cflags: -I${includedir} @OpenMP_C_FLAGS@ From 176107d23ab42f11e94c940790876fec76006611 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Oct 2024 13:31:14 +0200 Subject: [PATCH 061/244] Add -fopenmp to cflags in pkgconfig file if set --- Makefile.install | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.install b/Makefile.install index 6892efa510..129ed9a137 100644 --- a/Makefile.install +++ b/Makefile.install @@ -14,6 +14,9 @@ endif ifeq ($(INTERFACE64),1) USE_64BITINT=1 endif +ifeq ($(USE_OPENMP),1) + FOMP_OPT:= -fopenmp +endif PREFIX ?= /opt/OpenBLAS @@ -178,6 +181,7 @@ endif @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" + @echo 'omp_opt='$(FOMP_OPT) >> "$(PKGFILE)" @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 
'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" @echo 'version='$(VERSION) >> "$(PKGFILE)" @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" From fa775613966c2e770f7f4da58e170dcd1cfc2d4b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Oct 2024 13:32:45 +0200 Subject: [PATCH 062/244] add openmp option to pkgconfig template --- openblas.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.pc.in b/openblas.pc.in index 23804f4a2a..d9bb845499 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -4,4 +4,4 @@ Version: ${version} URL: https://github.com/xianyi/OpenBLAS Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix} Libs.private: ${extralib} -Cflags: -I${includedir} +Cflags: -I${includedir} ${omp_opt} From 32095b0cbbfbf2a9db382931cacbc400ae975603 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 1 Oct 2024 09:32:42 -0500 Subject: [PATCH 063/244] Remove parameter. --- kernel/power/sbgemv_common.c | 8 ++++---- kernel/power/sbgemv_t_vsx.c | 34 +++++++++++++++++----------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index ab50f430af..c9438b7e6d 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -58,9 +58,9 @@ FORCEINLINE vec_f32 vec_load_mult(vec_bf16 *in, vec_f32 *inp, vec_bf16 zero) return vec_mult(inp, in0, zero); } -FORCEINLINE void vec_load_vec2(vec_bf16 *in, BLASLONG i, vec_f32 *v_x0, vec_bf16 zero) +FORCEINLINE void vec_load_vec2(vec_bf16 *in, vec_f32 *v_x0, vec_bf16 zero) { - vec_bf16 inp = (vec_bf16)vec_load_vec(&in[i]); + vec_bf16 inp = (vec_bf16)vec_load_vec(in); v_x0[0] = BF16_HI(inp, zero); v_x0[1] = BF16_LO(inp, zero); @@ -89,9 +89,9 @@ FORCEINLINE vec_f32 vec_loadN_mult(vec_bf16 *in, vec_f32 *inp, BLASLONG n, vec_b return vec_mult(inp, in0, zero); } -FORCEINLINE void vec_loadN_vec2(vec_bf16 *in, BLASLONG i, vec_f32 *v_x0, BLASLONG n, vec_bf16 zero) +FORCEINLINE void vec_loadN_vec2(vec_bf16 *in, vec_f32 *v_x0, BLASLONG n, vec_bf16 zero) { - vec_bf16 inp = vec_loadN(&in[i], n); + vec_bf16 inp = vec_loadN(in, n); v_x0[0] = BF16_HI(inp, zero); v_x0[1] = BF16_LO(inp, zero); diff --git a/kernel/power/sbgemv_t_vsx.c b/kernel/power/sbgemv_t_vsx.c index 0750405031..272dccef76 100644 --- a/kernel/power/sbgemv_t_vsx.c +++ b/kernel/power/sbgemv_t_vsx.c @@ -55,14 +55,14 @@ static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL BLASLONG i = 0; for (; i < n8; i++) { - vec_load_vec2(v_x, i, inp, zero); + vec_load_vec2(&v_x[i], inp, zero); temp0 += vec_load_mult(&va0[i], inp, zero); } n &= 7; if (n > 4) { - vec_loadN_vec2(v_x, i, inp, n, zero); + vec_loadN_vec2(&v_x[i], inp, n, zero); temp0 += vec_loadN_mult(&va0[i], inp, n, zero); } else if (n) { @@ -92,7 +92,7 @@ static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL BLASLONG i = 0; for (; i < n8; i++) { - vec_load_vec2(v_x, i, inp, zero); + vec_load_vec2(&v_x[i], inp, zero); temp0 += vec_load_mult(&va0[i], inp, zero); temp1 += vec_load_mult(&va1[i], inp, zero); @@ -100,7 +100,7 @@ static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL n &= 7; if (n > 4) { - vec_loadN_vec2(v_x, i, inp, n, zero); + vec_loadN_vec2(&v_x[i], inp, n, zero); temp0 += vec_loadN_mult(&va0[i], inp, n, zero); temp1 += vec_loadN_mult(&va1[i], inp, n, zero); @@ -139,7 
+139,7 @@ static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL BLASLONG i = 0; for (; i < n8; i++) { - vec_load_vec2(v_x, i, inp, zero); + vec_load_vec2(&v_x[i], inp, zero); temp0 += vec_load_mult(&va0[i], inp, zero); temp1 += vec_load_mult(&va1[i], inp, zero); @@ -149,7 +149,7 @@ static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL n &= 7; if (n > 4) { - vec_loadN_vec2(v_x, i, inp, n, zero); + vec_loadN_vec2(&v_x[i], inp, n, zero); temp0 += vec_loadN_mult(&va0[i], inp, n, zero); temp1 += vec_loadN_mult(&va1[i], inp, n, zero); @@ -220,7 +220,7 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL BLASLONG i = 0; for (; i < n8; i++) { - vec_load_vec2(v_x, i, inp, zero); + vec_load_vec2(&v_x[i], inp, zero); temp0 += vec_load_mult(&va0[i], inp, zero); temp1 += vec_load_mult(&va1[i], inp, zero); @@ -234,7 +234,7 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL n &= 7; if (n > 4) { - vec_loadN_vec2(v_x, i, inp, n, zero); + vec_loadN_vec2(&v_x[i], inp, n, zero); temp0 += vec_loadN_mult(&va0[i], inp, n, zero); temp1 += vec_loadN_mult(&va1[i], inp, n, zero); @@ -257,7 +257,7 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp7 += vec_loadNHi_mult(&va7[i], inp[0], n, zero); } - vec_f32 t0, t1, t2, t3; + vec_f32 t0, t1, t2, t3, t10, t11, t12, t13; vec_f32 a = { alpha, alpha, alpha, alpha }; vec_f32 b = { beta, beta, beta, beta }; vec_f32 *v_y = (vec_f32 *) y; @@ -272,14 +272,14 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp3 = vec_mergel(t1, t3); temp0 += temp1 + temp2 + temp3; - t0 = vec_mergeh(temp4, temp6); - t1 = vec_mergel(temp4, temp6); - t2 = vec_mergeh(temp5, temp7); - t3 = vec_mergel(temp5, temp7); - temp4 = vec_mergeh(t0, t2); - temp5 = vec_mergel(t0, t2); - temp6 = vec_mergeh(t1, t3); - temp7 = vec_mergel(t1, t3); + t10 = vec_mergeh(temp4, temp6); + t11 = vec_mergel(temp4, temp6); + t12 = vec_mergeh(temp5, temp7); + t13 = vec_mergel(temp5, temp7); + temp4 = vec_mergeh(t10, t12); + temp5 = vec_mergel(t10, t12); + temp6 = vec_mergeh(t11, t13); + temp7 = vec_mergel(t11, t13); temp4 += temp5 + temp6 + temp7; vec_load_pair(inp, v_y); From e238a68c03db1fe808b4919ac2200089009b1382 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 1 Oct 2024 11:06:23 -0500 Subject: [PATCH 064/244] Remove duplicate. 
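The duplicate in question is vec_loadNHi_mult2, a thin wrapper that merely reorders the arguments of vec_loadNHi_mult before forwarding to it; the call sites in sbgemv_n_vsx.c are rewritten against the surviving helper with the argument order swapped. A minimal self-contained C sketch of the same refactor, using illustrative scalar stand-ins rather than the kernel's actual vec_bf16/vec_f32 types:

    #include <stdio.h>

    /* kept helper: multiply the input element by a value loaded from 'in' */
    static inline float loadNHi_mult(const float *in, float v_inp, int n)
    {
        return v_inp * in[n - 1];          /* stand-in for vec_loadNHi_mult */
    }

    /* removed duplicate: identical work, arguments merely reordered */
    static inline float loadNHi_mult2(float v_x, const float *in, int n)
    {
        return loadNHi_mult(in, v_x, n);   /* stand-in for vec_loadNHi_mult2 */
    }

    int main(void)
    {
        float col[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
        /* before: vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); */
        float before = loadNHi_mult2(2.0f, col, 3);
        /* after:  vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero);  */
        float after = loadNHi_mult(col, 2.0f, 3);
        printf("%g %g\n", before, after);  /* same result either way: 6 6 */
        return 0;
    }

Both call forms compute the same product, so dropping the wrapper changes no results; it only removes a redundant entry point from sbgemv_common.c.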
--- kernel/power/sbgemv_common.c | 7 ------- kernel/power/sbgemv_n_vsx.c | 30 +++++++++++++++--------------- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index c9438b7e6d..ad040b3711 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -111,13 +111,6 @@ FORCEINLINE vec_f32 vec_loadNHi_mult(vec_bf16 *in, vec_f32 v_inp0, BLASLONG n, v return (v_inp0 * v_in00); } -FORCEINLINE vec_f32 vec_loadNHi_mult2(vec_f32 v_x0, vec_bf16 *in, BLASLONG n, vec_bf16 zero) -{ - vec_f32 v_in00 = vec_loadNHi(in, n, zero); - - return (v_x0 * v_in00); -} - FORCEINLINE vec_f32 vec_loadNHi_vec(vec_bf16 *in, BLASLONG i, BLASLONG n, vec_bf16 zero) { return vec_loadNHi(&in[i], n, zero); diff --git a/kernel/power/sbgemv_n_vsx.c b/kernel/power/sbgemv_n_vsx.c index e8f6dca9fc..390a87359d 100644 --- a/kernel/power/sbgemv_n_vsx.c +++ b/kernel/power/sbgemv_n_vsx.c @@ -80,7 +80,7 @@ static void BF16GEMV_N_VSX_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA } else if (n) { vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); + vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero); vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); } @@ -131,8 +131,8 @@ static void BF16GEMV_N_VSX_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA } else if (n) { vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x1, &va1[i], n, zero); + vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero); + vy0[0] += vec_loadNHi_mult(&va1[i], v_x1, n, zero); vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); } @@ -193,10 +193,10 @@ static void BF16GEMV_N_VSX_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOA } else if (n) { vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x1, &va1[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x2, &va2[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x3, &va3[i], n, zero); + vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero); + vy0[0] += vec_loadNHi_mult(&va1[i], v_x1, n, zero); + vy0[0] += vec_loadNHi_mult(&va2[i], v_x2, n, zero); + vy0[0] += vec_loadNHi_mult(&va3[i], v_x3, n, zero); vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); } @@ -281,14 +281,14 @@ static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLAS } else if (n) { vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); - vy0[0] += vec_loadNHi_mult2(v_x0, &va0[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x1, &va1[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x2, &va2[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x3, &va3[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x4, &vb0[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x5, &vb1[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x6, &vb2[i], n, zero); - vy0[0] += vec_loadNHi_mult2(v_x7, &vb3[i], n, zero); + vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero); + vy0[0] += vec_loadNHi_mult(&va1[i], v_x1, n, zero); + vy0[0] += vec_loadNHi_mult(&va2[i], v_x2, n, zero); + vy0[0] += vec_loadNHi_mult(&va3[i], v_x3, n, zero); + vy0[0] += vec_loadNHi_mult(&vb0[i], v_x4, n, zero); + vy0[0] += vec_loadNHi_mult(&vb1[i], v_x5, n, zero); + vy0[0] += vec_loadNHi_mult(&vb2[i], v_x6, n, zero); + vy0[0] += vec_loadNHi_mult(&vb3[i], v_x7, n, zero); vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); } From 7cc00f68c999750b8e5da8ffc6faf76cbe4deb58 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 1 Oct 2024 11:23:32 
-0500 Subject: [PATCH 065/244] Remove more duplicates. --- kernel/power/sbgemv_common.c | 5 ----- kernel/power/sbgemv_t_vsx.c | 8 ++++---- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index ad040b3711..156eadce75 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -111,11 +111,6 @@ FORCEINLINE vec_f32 vec_loadNHi_mult(vec_bf16 *in, vec_f32 v_inp0, BLASLONG n, v return (v_inp0 * v_in00); } -FORCEINLINE vec_f32 vec_loadNHi_vec(vec_bf16 *in, BLASLONG i, BLASLONG n, vec_bf16 zero) -{ - return vec_loadNHi(&in[i], n, zero); -} - FORCEINLINE void copy_x(BLASLONG n, IFLOAT *src, IFLOAT *dest, BLASLONG inc_src) { for (BLASLONG i = 0; i < n; i++) { diff --git a/kernel/power/sbgemv_t_vsx.c b/kernel/power/sbgemv_t_vsx.c index 272dccef76..9d5e6d9976 100644 --- a/kernel/power/sbgemv_t_vsx.c +++ b/kernel/power/sbgemv_t_vsx.c @@ -66,7 +66,7 @@ static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp0 += vec_loadN_mult(&va0[i], inp, n, zero); } else if (n) { - inp[0] = vec_loadNHi_vec(v_x, i, n, zero); + inp[0] = vec_loadNHi(&v_x[i], n, zero); temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); } @@ -105,7 +105,7 @@ static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp0 += vec_loadN_mult(&va0[i], inp, n, zero); temp1 += vec_loadN_mult(&va1[i], inp, n, zero); } else if (n) { - inp[0] = vec_loadNHi_vec(v_x, i, n, zero); + inp[0] = vec_loadNHi(&v_x[i], n, zero); temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); @@ -156,7 +156,7 @@ static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp2 += vec_loadN_mult(&va2[i], inp, n, zero); temp3 += vec_loadN_mult(&va3[i], inp, n, zero); } else if (n) { - inp[0] = vec_loadNHi_vec(v_x, i, n, zero); + inp[0] = vec_loadNHi(&v_x[i], n, zero); temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); @@ -245,7 +245,7 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp6 += vec_loadN_mult(&va6[i], inp, n, zero); temp7 += vec_loadN_mult(&va7[i], inp, n, zero); } else if (n) { - inp[0] = vec_loadNHi_vec(v_x, i, n, zero); + inp[0] = vec_loadNHi(&v_x[i], n, zero); temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); From 71131406aec579ec474a63eef7d7e555458454cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Oct 2024 18:32:48 +0200 Subject: [PATCH 066/244] Declare the input array in CBLAS_?GEADD as const --- cblas.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cblas.h b/cblas.h index 097b4303d6..83686f7433 100644 --- a/cblas.h +++ b/cblas.h @@ -407,13 +407,13 @@ void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); -void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, +void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, 
OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, float *c, OPENBLAS_CONST blasint cldc); -void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, +void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, double *c, OPENBLAS_CONST blasint cldc); -void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, +void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, float *c, OPENBLAS_CONST blasint cldc); -void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, +void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, double *c, OPENBLAS_CONST blasint cldc); void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array, From a49218166597b9465519d91f232ee67f143bf625 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Oct 2024 15:58:47 +0200 Subject: [PATCH 067/244] filter out Loongarch -mabi options for flang-new --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 9587ce4d3f..7bae728552 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1720,8 +1720,8 @@ LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS)) endif ifeq ($(F_COMPILER),FLANGNEW) -LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% ,$(FFLAGS)) -override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% ,$(FFLAGS)) +LAPACK_FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS)) +override FFLAGS := $(filter-out -m32 -m64 -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 -mtune=% -mabi=% ,$(FFLAGS)) endif LAPACK_CFLAGS = $(CFLAGS) From f817f260626744221798dbe9562b87602d1b5938 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Oct 2024 16:01:10 +0200 Subject: [PATCH 068/244] Add simpler EPILOGUE for clang --- common_loongarch64.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common_loongarch64.h b/common_loongarch64.h index 367e5df18b..2b48450a2d 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -281,9 +281,13 @@ REALNAME: ;\ #define GNUSTACK #endif /* defined(__linux__) && defined(__ELF__) */ +#ifdef __clang__ +#define 
EPILOGUE .end +#else #define EPILOGUE \ .end REALNAME ;\ GNUSTACK +#endif #define PROFCODE From 4f00f02567c660c45901b7946d3997735bd54092 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Oct 2024 16:06:33 +0200 Subject: [PATCH 069/244] Do not add -mabi flags for Loongson when the compiler is flang --- cmake/fc.cmake | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 8798ce8b4c..db818e4a03 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -61,21 +61,25 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F endif () if (LOONGARCH64) if (BINARY64) - CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) - if(COMPILER_SUPPORT_LP64D_ABI) - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") - else() - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") - endif () + if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + CHECK_C_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI) + if(COMPILER_SUPPORT_LP64D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") + endif () + endif () if (INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") endif () else () - CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) - if(COMPILER_SUPPORT_ILP32D_ABI) - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") - else() - set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + if (NOT CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + CHECK_C_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI) + if(COMPILER_SUPPORT_ILP32D_ABI) + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d") + else() + set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") + endif () endif () endif () endif () From 7ec3c16d822b18499c7276d3e7fe16bdf186f0a3 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 3 Oct 2024 13:27:33 -0500 Subject: [PATCH 070/244] Remove beta from optimized functions. 
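With inc_y == 1 the beta scaling of y is now applied once up front by BF16GEMV_N_beta, so the per-column kernels reduce to pure accumulation, y += alpha * A^T * x; the strided path still folds beta into copy_y via ybuffer. A rough scalar sketch of the restructuring, assuming unit strides and plain float in place of the bf16 vector types:

    #include <stddef.h>
    #include <stdio.h>

    /* single beta pass over y -- the role BF16GEMV_N_beta now plays */
    static void scale_y(int n, float *y, float beta)
    {
        for (int j = 0; j < n; j++) y[j] *= beta;
    }

    /* per-column kernel after the change: accumulate only, no beta */
    static void gemv_t_col(int m, const float *a_col, const float *x,
                           float *yj, float alpha)
    {
        float dot = 0.0f;
        for (int i = 0; i < m; i++) dot += a_col[i] * x[i];
        *yj += alpha * dot;     /* was: *yj = alpha * dot + beta * *yj */
    }

    static void gemv_t(int m, int n, float alpha, const float *a, int lda,
                       const float *x, float beta, float *y)
    {
        scale_y(n, y, beta);    /* beta handled exactly once */
        for (int j = 0; j < n; j++)
            gemv_t_col(m, a + (size_t)j * lda, x, &y[j], alpha);
    }

    int main(void)
    {
        float a[4] = { 1, 2, 3, 4 };           /* 2x2, column-major */
        float x[2] = { 1, 1 }, y[2] = { 10, 10 };
        gemv_t(2, 2, 1.0f, a, 2, x, 0.5f, y);  /* y = A^T*x + 0.5*y */
        printf("%g %g\n", y[0], y[1]);         /* prints: 8 12 */
        return 0;
    }

Keeping beta out of the hot loops lets the final reductions stay in plain += form, which is what the sbgemv_t_power10.c and sbgemv_t_vsx.c hunks below implement.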
--- Makefile.system | 1 + kernel/power/sbgemv_common.c | 65 ++++++++++++++++++++++++++++++++- kernel/power/sbgemv_n.c | 59 ------------------------------ kernel/power/sbgemv_t.c | 28 ++++++++------ kernel/power/sbgemv_t_power10.c | 22 +++++------ kernel/power/sbgemv_t_vsx.c | 22 +++++------ 6 files changed, 101 insertions(+), 96 deletions(-) diff --git a/Makefile.system b/Makefile.system index 2c5ca96906..8c030842a4 100644 --- a/Makefile.system +++ b/Makefile.system @@ -282,6 +282,7 @@ GEMM_GEMV_FORWARD = 1 endif ifeq ($(ARCH), power) GEMM_GEMV_FORWARD = 1 +GEMM_GEMV_FORWARD_BF16 = 1 endif ifeq ($(SMALL_MATRIX_OPT), 1) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index 156eadce75..47de837cc5 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -122,7 +122,10 @@ FORCEINLINE void copy_x(BLASLONG n, IFLOAT *src, IFLOAT *dest, BLASLONG inc_src) FORCEINLINE void copy_y_beta(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) { if (beta == 0) { - memset(dest, 0, sizeof(FLOAT) * n); + for (BLASLONG i = 0; i < n; i++) { + *dest++ = (FLOAT)0; + src += inc_src; + } } else if (beta == 1) { for (BLASLONG i = 0; i < n; i++) { *dest++ = *src; @@ -163,4 +166,64 @@ FORCEINLINE void move_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) dest += inc_dest; } } + +static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vector, FLOAT beta) +{ + if (beta == 0) { + memset(output_vector, 0, sizeof(FLOAT) * n); + } else if (beta == 1) { + if (output_vector != input_vector) { + memcpy(output_vector, input_vector, sizeof(FLOAT) * n); + } + } else { + vec_f32 b = { beta, beta, beta, beta }; + + vec_f32 *in = (vec_f32 *)input_vector; + vec_f32 *out = (vec_f32 *)output_vector; + + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 v_inp0[2]; + + for (; i + 4 <= n8; i += 4) { + vec_f32 v_inp1[2], v_inp2[2], v_inp3[2]; + vec_load_pair(v_inp0, &in[(i * 2) + 0]); + vec_load_pair(v_inp1, &in[(i * 2) + 2]); + vec_load_pair(v_inp2, &in[(i * 2) + 4]); + vec_load_pair(v_inp3, &in[(i * 2) + 6]); + v_inp0[0] *= b; + v_inp0[1] *= b; + v_inp1[0] *= b; + v_inp1[1] *= b; + v_inp2[0] *= b; + v_inp2[1] *= b; + v_inp3[0] *= b; + v_inp3[1] *= b; + vec_store_pair(&out[(i * 2) + 0], v_inp0); + vec_store_pair(&out[(i * 2) + 2], v_inp1); + vec_store_pair(&out[(i * 2) + 4], v_inp2); + vec_store_pair(&out[(i * 2) + 6], v_inp3); + } + + for (; i < n8; i++) { + vec_load_pair(v_inp0, &in[(i * 2) + 0]); + v_inp0[0] *= b; + v_inp0[1] *= b; + vec_store_pair(&out[(i * 2) + 0], v_inp0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(v_inp0, &in[(i * 2) + 0], n3); + v_inp0[0] *= b; + v_inp0[1] *= b; + vec_storeN2_f32(v_inp0, &out[(i * 2) + 0], n3); + } else if (n) { + v_inp0[0] = vec_loadN_f32(&in[(i * 2) + 0], n); + v_inp0[0] *= b; + vec_storeN_f32(v_inp0[0], &out[(i * 2) + 0], n); + } + } +} #endif diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c index eab0b4e33b..e6f7f587e6 100644 --- a/kernel/power/sbgemv_n.c +++ b/kernel/power/sbgemv_n.c @@ -27,65 +27,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef SBGEMV_N_COMMON_C #define SBGEMV_N_COMMON_C -static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vector, FLOAT beta) -{ - if (beta == 0) { - memset(output_vector, 0, sizeof(FLOAT) * n); - } else if (beta == 1) { - if (output_vector != input_vector) { - memcpy(output_vector, input_vector, sizeof(FLOAT) * n); - } - } else { - vec_f32 b = { beta, beta, beta, beta }; - - vec_f32 *in = (vec_f32 *)input_vector; - vec_f32 *out = (vec_f32 *)output_vector; - - BLASLONG n8 = n / 8; - BLASLONG i = 0; - vec_f32 v_inp0[2]; - - for (; i + 4 <= n8; i += 4) { - vec_f32 v_inp1[2], v_inp2[2], v_inp3[2]; - vec_load_pair(v_inp0, &in[(i * 2) + 0]); - vec_load_pair(v_inp1, &in[(i * 2) + 2]); - vec_load_pair(v_inp2, &in[(i * 2) + 4]); - vec_load_pair(v_inp3, &in[(i * 2) + 6]); - v_inp0[0] *= b; - v_inp0[1] *= b; - v_inp1[0] *= b; - v_inp1[1] *= b; - v_inp2[0] *= b; - v_inp2[1] *= b; - v_inp3[0] *= b; - v_inp3[1] *= b; - vec_store_pair(&out[(i * 2) + 0], v_inp0); - vec_store_pair(&out[(i * 2) + 2], v_inp1); - vec_store_pair(&out[(i * 2) + 4], v_inp2); - vec_store_pair(&out[(i * 2) + 6], v_inp3); - } - - for (; i < n8; i++) { - vec_load_pair(v_inp0, &in[(i * 2) + 0]); - v_inp0[0] *= b; - v_inp0[1] *= b; - vec_store_pair(&out[(i * 2) + 0], v_inp0); - } - - n &= 7; - if (n > 4) { - BLASLONG n3 = n & 3; - vec_loadN2_f32(v_inp0, &in[(i * 2) + 0], n3); - v_inp0[0] *= b; - v_inp0[1] *= b; - vec_storeN2_f32(v_inp0, &out[(i * 2) + 0], n3); - } else if (n) { - v_inp0[0] = vec_loadN_f32(&in[(i * 2) + 0], n); - v_inp0[0] *= b; - vec_storeN_f32(v_inp0[0], &out[(i * 2) + 0], n); - } - } -} #if (defined(_ARCH_PWR10) && (defined(USE_BFGEMV_8_N_MMA) || (!defined(USE_BFGEMV_N_MMA) && defined(USE_BFGEMV_8_N_VSX)))) || (!defined(_ARCH_PWR10) && defined(USE_BFGEMV_8_N_VSX)) #define USE_N_8 diff --git a/kernel/power/sbgemv_t.c b/kernel/power/sbgemv_t.c index c6fdb6b1ae..594b1fc57b 100644 --- a/kernel/power/sbgemv_t.c +++ b/kernel/power/sbgemv_t.c @@ -41,6 +41,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * if ((m < 1) || (n < 1)) return 0; + if (inc_y == 1) { + BF16GEMV_N_beta(n, y, y, beta); + } + xbuffer = buffer; BLASLONG lda4 = lda << 2; @@ -58,18 +62,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * } a_ptr = a; + a += NB; y_ptr = y; if (inc_x != 1) { copy_x(NB, x, xbuffer, inc_x); + x += NB * inc_x; } else { xbuffer = x; + x += NB; } if (inc_y == 1) { #ifdef USE_T_8 for (BLASLONG j = 0; j + 8 <= n; j += 8) { - BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); + BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); y_ptr += 8; a_ptr += lda8; } @@ -77,23 +84,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * #else for (BLASLONG j = 0; j + 4 <= n; j += 4) { #endif - BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); + BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, y_ptr, alpha); y_ptr += 4; a_ptr += lda4; } if (n & 2) { - BF16GEMV_T_2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); + BF16GEMV_T_2(NB, lda, a_ptr, xbuffer, y_ptr, alpha); y_ptr += 2; a_ptr += (lda * 2); } if (n & 1) { - BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, y_ptr, alpha, beta); + BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, y_ptr, alpha); } } else { #ifdef USE_T_8 for (BLASLONG j = 0; j + 8 <= n; j += 8) { memset(ybuffer, 0, sizeof(FLOAT) * 8); - BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); + BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); copy_y(8, ybuffer, y_ptr, inc_y, beta); y_ptr += 8 * inc_y; a_ptr += 
lda8; @@ -103,28 +110,25 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT * for (BLASLONG j = 0; j + 4 <= n; j += 4) { #endif memset(ybuffer, 0, sizeof(FLOAT) * 4); - BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); + BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); copy_y(4, ybuffer, y_ptr, inc_y, beta); y_ptr += 4 * inc_y; a_ptr += lda4; } if (n & 2) { memset(ybuffer, 0, sizeof(FLOAT) * 4); - BF16GEMV_T_2(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); + BF16GEMV_T_2(NB, lda, a_ptr, xbuffer, ybuffer, alpha); copy_y(2, ybuffer, y_ptr, inc_y, beta); y_ptr += 2 * inc_y; a_ptr += (lda * 2); } if (n & 1) { memset(ybuffer, 0, sizeof(FLOAT) * 4); - BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, ybuffer, alpha, beta); + BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, ybuffer, alpha); copy_y(1, ybuffer, y_ptr, inc_y, beta); } + beta = (FLOAT)1; } - - a += NB; - x += NB * inc_x; - beta = (FLOAT)1; } return 0; diff --git a/kernel/power/sbgemv_t_power10.c b/kernel/power/sbgemv_t_power10.c index d2f6087f05..40c166354b 100644 --- a/kernel/power/sbgemv_t_power10.c +++ b/kernel/power/sbgemv_t_power10.c @@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define USE_BFGEMV_8_T_MMA -static void BF16GEMV_T_MMA_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +static void BF16GEMV_T_MMA_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) { IFLOAT *a0; vec_bf16 *va0, *v_x; @@ -90,10 +90,10 @@ static void BF16GEMV_T_MMA_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL __builtin_mma_disassemble_acc((void*)temp00, &temp0); - y[0] = (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])) + (beta * y[0]); + y[0] += (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])); } -static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) { IFLOAT *a0, *a1; vec_bf16 *va0, *va1, *v_x; @@ -142,11 +142,11 @@ static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_reduce_2(temp00, &temp0[0]); - y[0] = (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])) + (beta * y[0]); - y[1] = (alpha * (temp00[4][0] + temp00[5][1] + temp00[6][2] + temp00[7][3])) + (beta * y[1]); + y[0] += (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])); + y[1] += (alpha * (temp00[4][0] + temp00[5][1] + temp00[6][2] + temp00[7][3])); } -static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) { IFLOAT *a0, *a1, *a2, *a3; vec_bf16 *va0, *va1, *va2, *va3, *v_x; @@ -201,7 +201,6 @@ static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 t0, t1, t2, t3, t4, t5, t6, t7; vec_f32 a = { alpha, alpha, alpha, alpha }; - vec_f32 b = { beta, beta, beta, beta }; vec_f32 *v_y = (vec_f32 *) y; t0 = vec_mergeh(temp00[ 0], temp00[ 4]); @@ -219,11 +218,11 @@ static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL t0 += t2 + t4 + t6; - v_y[0] = (a * t0) + (b * v_y[0]); + v_y[0] += (a * t0); } #ifdef USE_BFGEMV_8_T_MMA -static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG 
lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) { IFLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; vec_bf16 *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; @@ -291,7 +290,6 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 t0, t1, t2, t3, t4, t5, t6, t7, t10, t11, t12, t13, t14, t15, t16, t17; vec_f32 a = { alpha, alpha, alpha, alpha }; - vec_f32 b = { beta, beta, beta, beta }; vec_f32 *v_y = (vec_f32 *) y; t0 = vec_mergeh(temp00[ 0], temp00[ 4]); @@ -326,8 +324,8 @@ static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 inp2[2]; vec_load_pair(inp2, v_y); - inp2[0] = (a * t0) + (b * inp2[0]); - inp2[1] = (a * t10) + (b * inp2[1]); + inp2[0] += (a * t0); + inp2[1] += (a * t10); vec_store_pair(v_y, inp2); } #endif diff --git a/kernel/power/sbgemv_t_vsx.c b/kernel/power/sbgemv_t_vsx.c index 9d5e6d9976..e72d2f31e0 100644 --- a/kernel/power/sbgemv_t_vsx.c +++ b/kernel/power/sbgemv_t_vsx.c @@ -40,7 +40,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define USE_BFGEMV_8_T_VSX -static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) { IFLOAT *a0; vec_bf16 *va0, *v_x; @@ -71,10 +71,10 @@ static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); } - y[0] = (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])) + (beta * y[0]); + y[0] += (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])); } -static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) { IFLOAT *a0, *a1; vec_bf16 *va0, *va1, *v_x; @@ -111,11 +111,11 @@ static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); } - y[0] = (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])) + (beta * y[0]); - y[1] = (alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3])) + (beta * y[1]); + y[0] += (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])); + y[1] += (alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3])); } -static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) { IFLOAT *a0, *a1, *a2, *a3; vec_bf16 *va0, *va1, *va2, *va3, *v_x; @@ -166,7 +166,6 @@ static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 t0, t1, t2, t3; vec_f32 a = { alpha, alpha, alpha, alpha }; - vec_f32 b = { beta, beta, beta, beta }; vec_f32 *v_y = (vec_f32 *) y; t0 = vec_mergeh(temp0, temp2); @@ -179,11 +178,11 @@ static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp3 = vec_mergel(t1, t3); temp0 += temp1 + temp2 + temp3; - v_y[0] = (a * temp0) + (b * v_y[0]); + v_y[0] += (a * temp0); } #ifdef USE_BFGEMV_8_T_VSX -static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha, FLOAT beta) +static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) { IFLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; vec_bf16 *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; @@ -259,7 +258,6 @@ static void 
BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 t0, t1, t2, t3, t10, t11, t12, t13; vec_f32 a = { alpha, alpha, alpha, alpha }; - vec_f32 b = { beta, beta, beta, beta }; vec_f32 *v_y = (vec_f32 *) y; t0 = vec_mergeh(temp0, temp2); @@ -283,8 +281,8 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL temp4 += temp5 + temp6 + temp7; vec_load_pair(inp, v_y); - inp[0] = (a * temp0) + (b * inp[0]); - inp[1] = (a * temp4) + (b * inp[1]); + inp[0] += (a * temp0); + inp[1] += (a * temp4); vec_store_pair(v_y, inp); } #endif From 915a6d6e44b838e7618e56021bf8dee6163b6ff0 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 3 Oct 2024 14:08:21 -0500 Subject: [PATCH 071/244] Add casting. --- kernel/power/sbgemv_common.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index 47de837cc5..8ad7f92e73 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -121,12 +121,12 @@ FORCEINLINE void copy_x(BLASLONG n, IFLOAT *src, IFLOAT *dest, BLASLONG inc_src) FORCEINLINE void copy_y_beta(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) { - if (beta == 0) { + if (beta == (FLOAT)0) { for (BLASLONG i = 0; i < n; i++) { *dest++ = (FLOAT)0; src += inc_src; } - } else if (beta == 1) { + } else if (beta == (FLOAT)1) { for (BLASLONG i = 0; i < n; i++) { *dest++ = *src; src += inc_src; @@ -141,12 +141,12 @@ FORCEINLINE void copy_y_beta(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_s FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) { - if (beta == 0) { + if (beta == (FLOAT)0) { for (BLASLONG i = 0; i < n; i++) { *dest = *src++; dest += inc_src; } - } else if (beta == 1) { + } else if (beta == (FLOAT)1) { for (BLASLONG i = 0; i < n; i++) { *dest += *src++; dest += inc_src; @@ -169,9 +169,9 @@ FORCEINLINE void move_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vector, FLOAT beta) { - if (beta == 0) { + if (beta == (FLOAT)0) { memset(output_vector, 0, sizeof(FLOAT) * n); - } else if (beta == 1) { + } else if (beta == (FLOAT)1) { if (output_vector != input_vector) { memcpy(output_vector, input_vector, sizeof(FLOAT) * n); } From 7c4f3638fd74194cd0afc3ecc8074417d09d9b95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Oct 2024 22:00:15 +0200 Subject: [PATCH 072/244] switch PPCG4 SGEMM kernel to 4x4 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 66eedc7980..259592cdfe 100644 --- a/param.h +++ b/param.h @@ -2243,7 +2243,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x0ffffUL -#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_M 4 #define SGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_M 4 #define DGEMM_DEFAULT_UNROLL_N 4 From d714013ab94ef45d9089fdfdded9beb32469d9a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Oct 2024 22:04:20 +0200 Subject: [PATCH 073/244] change sgemm kernel to 4x4 as the 16x4 altivec goes out of bounds --- kernel/power/KERNEL.PPCG4 | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4 index c73601cee3..0297df5973 100644 --- a/kernel/power/KERNEL.PPCG4 +++ b/kernel/power/KERNEL.PPCG4 @@ -70,13 +70,13 @@ DSCALKERNEL = scal_ppc440.S CSCALKERNEL = zscal_ppc440.S ZSCALKERNEL = zscal_ppc440.S -SGEMMKERNEL = gemm_kernel_altivec_g4.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c -SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMKERNEL = gemm_kernel_g4.S +SGEMMINCOPY = +SGEMMITCOPY = SGEMMONCOPY = gemm_ncopy_4.S SGEMMOTCOPY = gemm_tcopy_4.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = gemm_kernel_g4.S From a6b775188177ea5cae240aa6ff8e44748b4232dc Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Tue, 30 Jul 2024 15:14:05 +0000 Subject: [PATCH 074/244] BUG: Allow tests to be run multiple times Without failures due to existing files --- test/cblat3_3m.f | 5 ++--- test/zblat3_3m.f | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/test/cblat3_3m.f b/test/cblat3_3m.f index 19f7830bef..b61fca53c9 100644 --- a/test/cblat3_3m.f +++ b/test/cblat3_3m.f @@ -104,7 +104,7 @@ PROGRAM CBLAT3 * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -113,7 +113,7 @@ PROGRAM CBLAT3 READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI @@ -3439,4 +3439,3 @@ SUBROUTINE XERBLA( SRNAME, INFO ) * End of XERBLA * END - diff --git a/test/zblat3_3m.f b/test/zblat3_3m.f index bac23aa547..86977d6742 100644 --- a/test/zblat3_3m.f +++ b/test/zblat3_3m.f @@ -105,7 +105,7 @@ PROGRAM ZBLAT3 * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'NEW' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -114,7 +114,7 @@ PROGRAM ZBLAT3 READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'NEW' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) END IF * Read the flag that directs rewinding of the snapshot file. 
READ( NIN, FMT = * )REWI From 722e4ae07a8adff3be55c29a5452fb1851c724a8 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Tue, 30 Jul 2024 15:24:23 +0000 Subject: [PATCH 075/244] MAINT: Explicitly replace instead of unknown --- test/cblat3_3m.f | 4 ++-- test/zblat3_3m.f | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/cblat3_3m.f b/test/cblat3_3m.f index b61fca53c9..6c7d7e1693 100644 --- a/test/cblat3_3m.f +++ b/test/cblat3_3m.f @@ -104,7 +104,7 @@ PROGRAM CBLAT3 * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -113,7 +113,7 @@ PROGRAM CBLAT3 READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI diff --git a/test/zblat3_3m.f b/test/zblat3_3m.f index 86977d6742..e65c72b489 100644 --- a/test/zblat3_3m.f +++ b/test/zblat3_3m.f @@ -105,7 +105,7 @@ PROGRAM ZBLAT3 * READ( NIN, FMT = * )SUMMRY READ( NIN, FMT = * )NOUT - OPEN( NOUT, FILE = SUMMRY, STATUS = 'UNKNOWN' ) + OPEN( NOUT, FILE = SUMMRY, STATUS = 'REPLACE' ) NOUTC = NOUT * * Read name and unit number for snapshot output file and open file. @@ -114,7 +114,7 @@ PROGRAM ZBLAT3 READ( NIN, FMT = * )NTRA TRACE = NTRA.GE.0 IF( TRACE )THEN - OPEN( NTRA, FILE = SNAPS, STATUS = 'UNKNOWN' ) + OPEN( NTRA, FILE = SNAPS, STATUS = 'REPLACE' ) END IF * Read the flag that directs rewinding of the snapshot file. READ( NIN, FMT = * )REWI From d9f368dfe6a9e96807d3860b96d9b30471583dc9 Mon Sep 17 00:00:00 2001 From: Rohit Goswami Date: Mon, 29 Jul 2024 03:51:21 +0000 Subject: [PATCH 076/244] TST: Signal abort for ctest failures correctly --- ctest/c_cblat1.f | 11 +++++++---- ctest/c_cblat2.f | 12 +++++++----- ctest/c_cblat3.f | 16 +++++++++------- ctest/c_cblat3_3m.f | 16 +++++++++------- ctest/c_dblat1.f | 15 +++++++++------ ctest/c_dblat2.f | 12 +++++++----- ctest/c_dblat3.f | 16 +++++++++------- ctest/c_sblat1.f | 17 ++++++++++------- ctest/c_sblat2.f | 12 +++++++----- ctest/c_sblat3.f | 16 +++++++++------- ctest/c_zblat1.f | 11 +++++++---- ctest/c_zblat2.f | 12 +++++++----- ctest/c_zblat3.f | 16 +++++++++------- ctest/c_zblat3_3m.f | 16 +++++++++------- 14 files changed, 115 insertions(+), 83 deletions(-) diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index cad7c7fa73..73ab485bbd 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -38,9 +38,12 @@ PROGRAM CCBLAT1 CALL CHECK1(SFAC) END IF * -- Print - IF (PASS) WRITE (NOUT,99998) + IF (PASS) THEN + WRITE (NOUT,99998) + ELSE + CALL ABORT + END IF 20 CONTINUE - STOP * 99999 FORMAT (' Complex CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') @@ -228,7 +231,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - STOP + CALL ABORT END IF * 40 CONTINUE @@ -512,7 +515,7 @@ SUBROUTINE CHECK2(SFAC) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - STOP + CALL ABORT END IF * 40 CONTINUE diff --git a/ctest/c_cblat2.f b/ctest/c_cblat2.f index 9252339d45..d48c10b7c8 100644 --- a/ctest/c_cblat2.f +++ b/ctest/c_cblat2.f @@ -10,7 +10,7 @@ PROGRAM CBLAT2 * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 
0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -243,7 +243,7 @@ PROGRAM CBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - STOP + CALL ABORT 70 LTEST( I ) = LTESTT GO TO 50 * @@ -283,7 +283,7 @@ PROGRAM CBLAT2 SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF TRANS = 'T' CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -291,7 +291,7 @@ PROGRAM CBLAT2 SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -418,7 +418,9 @@ PROGRAM CBLAT2 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f index 74293ce53a..5d289aafe0 100644 --- a/ctest/c_cblat3.f +++ b/ctest/c_cblat3.f @@ -10,7 +10,7 @@ PROGRAM CBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -194,7 +194,7 @@ PROGRAM CBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -237,7 +237,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -246,7 +246,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -264,7 +264,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -273,7 +273,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -385,7 +385,9 @@ PROGRAM CBLAT3 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_cblat3_3m.f b/ctest/c_cblat3_3m.f index 9643ebc89d..73fca5664f 100644 --- a/ctest/c_cblat3_3m.f +++ b/ctest/c_cblat3_3m.f @@ -10,7 +10,7 @@ PROGRAM CBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -194,7 +194,7 @@ PROGRAM CBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -237,7 +237,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -246,7 +246,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -264,7 +264,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -273,7 +273,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -385,7 +385,9 @@ PROGRAM CBLAT3 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index 0139ede63d..99c8b5da49 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -44,9 +44,12 @@ PROGRAM DCBLAT1 CALL CHECK3(SFAC) END IF * -- Print - IF (PASS) WRITE (NOUT,99998) + IF (PASS) THEN + WRITE (NOUT,99998) + ELSE + CALL ABORT + END IF 20 CONTINUE - STOP * 99999 FORMAT (' Real CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') @@ -136,7 +139,7 @@ SUBROUTINE CHECK0(SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' - STOP + CALL ABORT END IF 20 CONTINUE 40 RETURN @@ -229,7 +232,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - STOP + CALL ABORT END IF 60 CONTINUE 80 CONTINUE @@ -384,7 +387,7 @@ SUBROUTINE CHECK2(SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - STOP + CALL ABORT END IF 100 CONTINUE 120 CONTINUE @@ -472,7 +475,7 @@ SUBROUTINE CHECK3(SFAC) 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' - STOP + CALL ABORT END IF 40 CONTINUE 60 CONTINUE diff --git a/ctest/c_dblat2.f b/ctest/c_dblat2.f index 0c7801d77d..01a21a7163 100644 --- a/ctest/c_dblat2.f +++ b/ctest/c_dblat2.f @@ -10,7 +10,7 @@ PROGRAM DBLAT2 * 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -239,7 +239,7 @@ PROGRAM DBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - STOP + CALL ABORT 70 LTEST( I ) = LTESTT GO TO 50 * @@ -279,7 +279,7 @@ PROGRAM DBLAT2 SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF TRANS = 'T' CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -287,7 +287,7 @@ PROGRAM DBLAT2 SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -414,7 +414,9 @@ PROGRAM DBLAT2 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_dblat3.f b/ctest/c_dblat3.f index 252fe3b718..00d16c2961 100644 --- a/ctest/c_dblat3.f +++ b/ctest/c_dblat3.f @@ -10,7 +10,7 @@ PROGRAM DBLAT3 * 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -189,7 +189,7 @@ PROGRAM DBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -232,7 +232,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -241,7 +241,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -259,7 +259,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -268,7 +268,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. 
@@ -379,7 +379,9 @@ PROGRAM DBLAT3 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 66a5def897..b88c2b7835 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -44,9 +44,12 @@ PROGRAM SCBLAT1 CALL CHECK3(SFAC) END IF * -- Print - IF (PASS) WRITE (NOUT,99998) + IF (PASS) THEN + WRITE (NOUT,99998) + ELSE + CALL ABORT + END IF 20 CONTINUE - STOP * 99999 FORMAT (' Real CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') @@ -136,7 +139,7 @@ SUBROUTINE CHECK0(SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' - STOP + CALL ABORT END IF 20 CONTINUE 40 RETURN @@ -229,7 +232,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - STOP + CALL ABORT END IF 60 CONTINUE 80 CONTINUE @@ -384,7 +387,7 @@ SUBROUTINE CHECK2(SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - STOP + CALL ABORT END IF 100 CONTINUE 120 CONTINUE @@ -479,7 +482,7 @@ SUBROUTINE CHECK3(SFAC) 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' - STOP + CALL ABORT END IF 40 CONTINUE 60 CONTINUE @@ -759,4 +762,4 @@ SUBROUTINE srotm(N,SX,INCX,SY,INCY,SPARAM) END IF END IF RETURN - END \ No newline at end of file + END diff --git a/ctest/c_sblat2.f b/ctest/c_sblat2.f index 6386abe042..18d568d5d3 100644 --- a/ctest/c_sblat2.f +++ b/ctest/c_sblat2.f @@ -10,7 +10,7 @@ PROGRAM SBLAT2 * 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -239,7 +239,7 @@ PROGRAM SBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - STOP + CALL ABORT 70 LTEST( I ) = LTESTT GO TO 50 * @@ -279,7 +279,7 @@ PROGRAM SBLAT2 SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF TRANS = 'T' CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -287,7 +287,7 @@ PROGRAM SBLAT2 SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -414,7 +414,9 @@ PROGRAM SBLAT2 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_sblat3.f b/ctest/c_sblat3.f index 4cfc1c706e..bbb58d04f6 100644 --- a/ctest/c_sblat3.f +++ b/ctest/c_sblat3.f @@ -10,7 +10,7 @@ PROGRAM SBLAT3 * 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -188,7 +188,7 @@ PROGRAM SBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -231,7 +231,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -240,7 +240,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -258,7 +258,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -267,7 +267,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -378,7 +378,9 @@ PROGRAM SBLAT3 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index cd0c8541df..43486433e3 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -38,9 +38,12 @@ PROGRAM ZCBLAT1 CALL CHECK1(SFAC) END IF * -- Print - IF (PASS) WRITE (NOUT,99998) + IF (PASS) THEN + WRITE (NOUT,99998) + ELSE + CALL ABORT + END IF 20 CONTINUE - STOP * 99999 FORMAT (' Complex CBLAS Test Program Results',/1X) 99998 FORMAT (' ----- PASS -----') @@ -228,7 +231,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - STOP + CALL ABORT END IF * 40 CONTINUE @@ -512,7 +515,7 @@ SUBROUTINE CHECK2(SFAC) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - STOP + CALL ABORT END IF * 40 CONTINUE diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f index cc5c1bad1a..daa1a603b2 100644 --- a/ctest/c_zblat2.f +++ b/ctest/c_zblat2.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT2 * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -243,7 +243,7 @@ PROGRAM ZBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - STOP + CALL ABORT 70 LTEST( I ) = LTESTT GO TO 50 * @@ -283,7 +283,7 @@ PROGRAM ZBLAT2 SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF TRANS = 'T' CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -291,7 +291,7 @@ PROGRAM ZBLAT2 SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. 
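The epilogue hunk that follows is slightly different: rather than aborting at the first failure, the drivers (run with the stop-on-failure input flag set to F) record failures in FATAL and keep testing, and the abort is deferred until after the snapshot and output files are closed. The old unconditional STOP at this point reported success regardless of what had failed. A sketch of the deferred pattern in C (illustrative names, assuming the semantics of the FATAL flag shown in the hunks):

    #include <stdbool.h>
    #include <stdlib.h>

    static bool fatal = false;        /* mirrors the drivers' FATAL flag */

    static void record(bool ok) {
        if (!ok) fatal = true;        /* remember the failure, keep testing */
    }

    int main(void) {
        record(true);
        record(false);                /* one failure among many checks */
        /* ... close snapshot/output files here ... */
        if (fatal) abort();           /* IF( FATAL ) THEN CALL ABORT */
        return 0;
    }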
@@ -418,7 +418,9 @@ PROGRAM ZBLAT2 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT( ' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f index cc109d6517..83eb9e9184 100644 --- a/ctest/c_zblat3.f +++ b/ctest/c_zblat3.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -195,7 +195,7 @@ PROGRAM ZBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -238,7 +238,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -247,7 +247,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -265,7 +265,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -274,7 +274,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -386,7 +386,9 @@ PROGRAM ZBLAT3 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) diff --git a/ctest/c_zblat3_3m.f b/ctest/c_zblat3_3m.f index ead64da27f..d0923439e8 100644 --- a/ctest/c_zblat3_3m.f +++ b/ctest/c_zblat3_3m.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO STOP ON FAILURES. +* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -195,7 +195,7 @@ PROGRAM ZBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - STOP + CALL ABORT 50 LTEST( I ) = LTESTT GO TO 30 * @@ -238,7 +238,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -247,7 +247,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -265,7 +265,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -274,7 +274,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - STOP + CALL ABORT END IF * * Test each subroutine in turn. @@ -386,7 +386,9 @@ PROGRAM ZBLAT3 IF( TRACE ) $ CLOSE ( NTRA ) CLOSE ( NOUT ) - STOP + IF( FATAL ) THEN + CALL ABORT + END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) 10001 FORMAT(' ROW-MAJOR DATA LAYOUT IS TESTED' ) From c9e92348a645842c47149ddd5ce3e418134470dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 19:57:17 +0200 Subject: [PATCH 077/244] Handle inf/nan if dummy2 flag is set --- kernel/power/scal_ppc440.S | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/scal_ppc440.S b/kernel/power/scal_ppc440.S index d977b0b592..014e3989e3 100644 --- a/kernel/power/scal_ppc440.S +++ b/kernel/power/scal_ppc440.S @@ -63,6 +63,8 @@ #endif #endif +#define FLAG r11 + #define FZERO f0 #define ALPHA f1 @@ -88,6 +90,10 @@ fcmpu cr0, FZERO, ALPHA bne- cr0, LL(A1I1) + lwz FLAG, FRAMESLOT(0)(SP) + cmpwi cr0, FLAG, 1 + beq- cr0, LL(A1I1) + srawi. r0, N, 4 mtspr CTR, r0 beq- cr0, LL(A0I1_Remain) From 8a1710dd0da445d76e6eaeb35b180d24efac0919 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 20:03:32 +0200 Subject: [PATCH 078/244] don't apply switch_ratio to tail of loop --- driver/level3/level3_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index ddb39abd66..3d56c45a99 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -742,7 +742,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG num_parts = 0; while (n > 0){ width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts); - if (width < switch_ratio) { + if (width < switch_ratio && width > 1) { width = switch_ratio; } width = round_up(n, width, GEMM_PREFERED_SIZE); From d6bb8dcfd1139037ec7538a64b9e143a05216740 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Sun, 6 Oct 2024 14:13:43 -0500 Subject: [PATCH 079/244] Common code. 
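copy_y_beta never reads src when beta is zero (the old loop only advanced the pointer), so the zero-store loop over the contiguous destination collapses to a single memset; move_y is hoisted above copy_y so the beta == 0 branch can reuse it. The memset relies on +0.0 having an all-zero byte pattern, which holds on every IEEE-754 target this kernel supports. A sketch of the equivalence (illustrative helper, not part of the change):

    #include <string.h>

    static void zero_fill(float *dest, size_t n) {
        /* same effect as: for (i = 0; i < n; i++) dest[i] = 0.0f; */
        memset(dest, 0, n * sizeof(float));
    }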
--- kernel/power/sbgemv_common.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c index 8ad7f92e73..830481fef3 100644 --- a/kernel/power/sbgemv_common.c +++ b/kernel/power/sbgemv_common.c @@ -122,10 +122,7 @@ FORCEINLINE void copy_x(BLASLONG n, IFLOAT *src, IFLOAT *dest, BLASLONG inc_src) FORCEINLINE void copy_y_beta(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) { if (beta == (FLOAT)0) { - for (BLASLONG i = 0; i < n; i++) { - *dest++ = (FLOAT)0; - src += inc_src; - } + memset(dest, 0, n * sizeof(FLOAT)); } else if (beta == (FLOAT)1) { for (BLASLONG i = 0; i < n; i++) { *dest++ = *src; @@ -139,13 +136,18 @@ FORCEINLINE void copy_y_beta(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_s } } +FORCEINLINE void move_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + for (BLASLONG i = 0; i < n; i++) { + *dest = *src++; + dest += inc_dest; + } +} + FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) { if (beta == (FLOAT)0) { - for (BLASLONG i = 0; i < n; i++) { - *dest = *src++; - dest += inc_src; - } + move_y(n, src, dest, inc_src); } else if (beta == (FLOAT)1) { for (BLASLONG i = 0; i < n; i++) { *dest += *src++; @@ -159,14 +161,6 @@ FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, F } } -FORCEINLINE void move_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - for (BLASLONG i = 0; i < n; i++) { - *dest = *src++; - dest += inc_dest; - } -} - static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vector, FLOAT beta) { if (beta == (FLOAT)0) { From 9783dd07ab1259cde1716a67d82c5752c3acf582 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 22:43:11 +0200 Subject: [PATCH 080/244] Rename KERNEL.LOONGSONGENERIC to KERNEL.LA64_GENERIC --- .../loongarch64/{KERNEL.LOONGSONGENERIC => KERNEL.LA64_GENERIC} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel/loongarch64/{KERNEL.LOONGSONGENERIC => KERNEL.LA64_GENERIC} (100%) diff --git a/kernel/loongarch64/KERNEL.LOONGSONGENERIC b/kernel/loongarch64/KERNEL.LA64_GENERIC similarity index 100% rename from kernel/loongarch64/KERNEL.LOONGSONGENERIC rename to kernel/loongarch64/KERNEL.LA64_GENERIC From 9c707dc6b9845e8df25a70e67f5a07ee1ca7332b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 22:46:03 +0200 Subject: [PATCH 081/244] Update dynamic arch list to new target scheme --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 0ff4f1df31..27ba6f8727 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -95,7 +95,7 @@ if (DYNAMIC_ARCH) endif () if (LOONGARCH64) - set(DYNAMIC_CORE LOONGSONGENERIC LOONGSON2K1000 LOONGSON3R5) + set(DYNAMIC_CORE LA64_GENERIC LA264 LA464) endif () if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) From b0346e72f48515d30683d69cada17bdabe4d80ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Oct 2024 22:48:33 +0200 Subject: [PATCH 082/244] update names of loongarch64 targets for cross-compilation --- cmake/prebuild.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 785c275c78..53a78d782f 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -1349,7 +1349,7 @@ endif () "#define DTB_DEFAULT_ENTRIES 128\n" "#define DTB_SIZE 4096\n" "#define L2_ASSOCIATIVE 4\n") - elseif ("${TCORE}" 
STREQUAL "LOONGSONGENERIC") + elseif ("${TCORE}" STREQUAL "LA64_GENERIC") file(APPEND ${TARGET_CONF_TEMP} "#define DTB_DEFAULT_ENTRIES 64\n") set(SGEMM_UNROLL_M 2) @@ -1364,7 +1364,7 @@ endif () set(CGEMM3M_UNROLL_N 8) set(ZGEMM3M_UNROLL_M 2) set(ZGEMM3M_UNROLL_N 8) - elseif ("${TCORE}" STREQUAL "LOONGSON2K1000") + elseif ("${TCORE}" STREQUAL "LA264") file(APPEND ${TARGET_CONF_TEMP} "#define DTB_DEFAULT_ENTRIES 64\n") set(HAVE_LSX 1) @@ -1380,7 +1380,7 @@ endif () set(CGEMM3M_UNROLL_N 8) set(ZGEMM3M_UNROLL_M 8) set(ZGEMM3M_UNROLL_N 4) - elseif ("${TCORE}" STREQUAL "LOONGSON3R5") + elseif ("${TCORE}" STREQUAL "LA464") file(APPEND ${TARGET_CONF_TEMP} "#define DTB_DEFAULT_ENTRIES 64\n") set(HAVE_LASX 1) From 2c3b87a082984731748ae47f604b44c751d3dd83 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Oct 2024 23:07:42 +0200 Subject: [PATCH 083/244] Add preliminary cpu autodetection for Zen5/5c --- cpuid_x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 9b2b7a51eb..4e13f1462c 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1689,6 +1689,7 @@ int get_cpuname(void){ return CPUTYPE_BARCELONA; } case 10: // Zen3/4 + case 11: // Zen5 #ifndef NO_AVX512 if(support_avx512_bf16()) return CPUTYPE_COOPERLAKE; @@ -2479,7 +2480,7 @@ int get_coretype(void){ } break; } - } else if (exfamily == 8 || exfamily == 10) { + } else if (exfamily == 8 || exfamily == 10 || exfamily == 11) { switch (model) { case 1: // AMD Ryzen From 3ab8b1408ec4a90168871fadbac916c365d871d2 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 8 Oct 2024 21:08:09 +0800 Subject: [PATCH 084/244] LoongArch64: Update README.md --- README.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/README.md b/README.md index 45bcf10e70..f6c7ec7431 100644 --- a/README.md +++ b/README.md @@ -221,6 +221,26 @@ e.g.: HOSTCC=gcc HOSTFC=gfortran -j ``` +#### LOONGARCH64 + +- **LA64_GENERIC**: Optimized Level-3, Level-2 and Level-1 BLAS with scalar instruction + ```sh + make HOSTCC=gcc TARGET=LA64_GENERIC CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1 + ``` + The old-style TARGET=LOONGSONGENERIC is still supported + +- **LA264**: Optimized Level-3, Level-2 and Level-1 BLAS with LSX instruction + ```sh + make HOSTCC=gcc TARGET=LA264 CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1 + ``` + The old-style TARGET=LOONGSON2K1000 is still supported + +- **LA464**: Optimized Level-3, Level-2 and Level-1 BLAS with LASX instruction + ```sh + make HOSTCC=gcc TARGET=LA464 CC=loongarch64-unknown-linux-gnu-gcc FC=loongarch64-unknown-linux-gnu-gfortran USE_SIMPLE_THREADED_LEVEL3=1 + ``` + The old-style TARGET=LOONGSON3R5 is still supported + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. @@ -238,6 +258,8 @@ on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support. On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support. A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled. +On **LoongArch64**, it comprises LA264 and LA464 as well as generic LoongArch64 support. 
+ The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. From bee123e8e3cb990197640e24aeebb1b03213ce94 Mon Sep 17 00:00:00 2001 From: NickelWenzel <55748692+NickelWenzel@users.noreply.github.com> Date: Wed, 9 Oct 2024 16:36:40 +0200 Subject: [PATCH 085/244] fix: add missing NO_AFFINITY checks --- common_arm64.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common_arm64.h b/common_arm64.h index d80b9e4345..876a4aa6de 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -55,6 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef ASSEMBLER +#ifndef NO_AFFINITY static __inline int WhereAmI(void){ uint64_t ret; __asm__ volatile ( @@ -67,6 +68,7 @@ static __inline int WhereAmI(void){ if ((int)ret <0) ret = 0; return (int)ret; } +#endif static __inline void blas_lock(volatile BLASULONG *address){ From 0b7fb5c7915ceb119a17bb762b28f22acbb04d0e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 9 Oct 2024 09:42:23 -0500 Subject: [PATCH 086/244] CGEMM & ZGEMM using C code. --- kernel/power/KERNEL.POWER10 | 24 +- kernel/power/cgemm_kernel_power10.c | 1154 +++++++++++++++++++++++++++ kernel/power/zgemm_kernel_power10.c | 761 ++++++++++++++++++ 3 files changed, 1931 insertions(+), 8 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power10.c create mode 100644 kernel/power/zgemm_kernel_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c84cd91d2a..4d17944ae7 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -17,11 +17,15 @@ SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c ifeq ($(OSNAME), AIX) -CTRMMKERNEL = ctrmm_kernel_8x4_power8.S -ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +#CTRMMKERNEL = ctrmm_kernel_8x4_power8.S +#ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c else -CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power10.S +#CTRMMKERNEL = cgemm_kernel_power10.S +#ZTRMMKERNEL = zgemm_kernel_power10.S +CTRMMKERNEL = cgemm_kernel_power10.c +ZTRMMKERNEL = zgemm_kernel_power10.c endif SGEMMKERNEL = sgemm_kernel_power10.c @@ -65,9 +69,11 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c ifeq ($(OSNAME), AIX) -CGEMMKERNEL = cgemm_kernel_8x4_power8.S +#CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMKERNEL = cgemm_kernel_power10.c else -CGEMMKERNEL = cgemm_kernel_power10.S +#CGEMMKERNEL = cgemm_kernel_power10.S +CGEMMKERNEL = cgemm_kernel_power10.c endif #CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c @@ -84,9 +90,11 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) ifeq ($(OSNAME), AIX) -ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +#ZGEMMKERNEL = zgemm_kernel_8x2_power8.S +ZGEMMKERNEL = zgemm_kernel_power10.c else -ZGEMMKERNEL = zgemm_kernel_power10.S +#ZGEMMKERNEL = zgemm_kernel_power10.S +ZGEMMKERNEL = zgemm_kernel_power10.c endif ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c diff --git a/kernel/power/cgemm_kernel_power10.c 
b/kernel/power/cgemm_kernel_power10.c
new file mode 100644
index 0000000000..279c83aec0
--- /dev/null
+++ b/kernel/power/cgemm_kernel_power10.c
@@ -0,0 +1,1154 @@
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char vec_t;
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
+
+#define SET_ACC_ZERO() \
+  __builtin_mma_xxsetaccz (&acc0); \
+  __builtin_mma_xxsetaccz (&acc1); \
+  __builtin_mma_xxsetaccz (&acc2); \
+  __builtin_mma_xxsetaccz (&acc3); \
+  __builtin_mma_xxsetaccz (&acc4); \
+  __builtin_mma_xxsetaccz (&acc5); \
+  __builtin_mma_xxsetaccz (&acc6); \
+  __builtin_mma_xxsetaccz (&acc7);
+
+#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; }
+#endif
+
+#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
+#endif
+
+#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; }
+#endif
+
+#if (defined(RR) || defined(RC) || defined(CR) || defined(CC))
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
+#endif
+
+#if defined
(TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define COMP_MUL_1 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) + +#define COMP_MAC_1(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ +} + +#define COMP_MUL_2A \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) + +#define COMP_MAC_2A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6]) \ +} + +#define COMP_MUL_2B \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12]) + +#define COMP_MAC_2B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ +} + +#define COMP_MUL_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MAC_4A(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20]) \ + COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28]) \ +} + +#define COMP_MUL_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + +#define COMP_MAC_4B(_offset) { \ + FLOAT *_ro = &res[_offset]; \ + COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4]) \ + COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12]) \ + COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6]) \ + COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14]) \ +} + + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_1 \ + COMP_MAC_1(16) \ + COMP_MAC_1(32) \ + COMP_MAC_1(48) \ + COMP_MAC_1(64) \ + COMP_MAC_1(80) \ + COMP_MAC_1(96) \ + COMP_MAC_1(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2A \ + COMP_MAC_2A(16) \ + COMP_MAC_2A(32) \ + COMP_MAC_2A(48) \ + COMP_MAC_2A(64) \ + COMP_MAC_2A(80) \ + COMP_MAC_2A(96) \ + COMP_MAC_2A(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * 
alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_2B \ + COMP_MAC_2B(16) \ + COMP_MAC_2B(32) \ + COMP_MAC_2B(48) \ + COMP_MAC_2B(64) \ + COMP_MAC_2B(80) \ + COMP_MAC_2B(96) \ + COMP_MAC_2B(112) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(32) \ + COMP_MAC_4A(64) \ + COMP_MAC_4A(96) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4A(0) \ + COMP_MAC_4A(64) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4A(32) \ + COMP_MAC_4A(96) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[ 4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(32) \ + CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(48) \ + CO[12] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[13] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[14] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[15] A_OP 
ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + COMP_MUL_4B(16) \ + CO[4] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[5] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[6] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[7] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL_4B(0) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_24_ALL \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4); \ + __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1); \ + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc5); \ + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc2); \ + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc6); \ + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc3); \ + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); \ + COMP_MUL(tr[ 0], res[ 0], res[ 5], ti[ 0], res[ 1], res[ 4]) \ + COMP_MUL(tr[ 1], res[ 8], res[ 13], ti[ 1], res[ 9], res[ 12]) \ + COMP_MUL(tr[ 2], res[ 2], res[ 7], ti[ 2], res[ 3], res[ 6]) \ + COMP_MUL(tr[ 3], res[ 10], res[ 15], ti[ 3], res[ 11], res[ 14]) \ + COMP_MUL(tr[ 4], res[ 16], res[ 21], ti[ 4], res[ 17], res[ 20]) \ + COMP_MUL(tr[ 5], res[ 24], res[ 29], ti[ 5], res[ 25], res[ 28]) \ + COMP_MUL(tr[ 6], res[ 18], res[ 23], ti[ 6], res[ 19], res[ 22]) \ + COMP_MUL(tr[ 7], res[ 26], res[ 31], ti[ 7], res[ 27], res[ 30]) \ + COMP_MUL(tr[ 8], res[ 32], res[ 37], ti[ 8], res[ 33], res[ 36]) \ + COMP_MUL(tr[ 9], res[ 40], res[ 45], ti[ 9], res[ 41], res[ 44]) \ + COMP_MUL(tr[10], res[ 34], res[ 39], ti[10], res[ 35], res[ 38]) \ + COMP_MUL(tr[11], res[ 42], res[ 47], ti[11], res[ 43], res[ 46]) \ + COMP_MUL(tr[12], res[ 48], res[ 53], ti[12], res[ 49], res[ 52]) \ + COMP_MUL(tr[13], res[ 56], res[ 61], ti[13], res[ 57], res[ 60]) \ + COMP_MUL(tr[14], res[ 50], res[ 55], ti[14], res[ 51], res[ 54]) \ + COMP_MUL(tr[15], res[ 58], res[ 63], ti[15], res[ 59], res[ 62]) \ + COMP_MUL(tr[16], res[ 64], res[ 69], ti[16], res[ 65], res[ 68]) \ + COMP_MUL(tr[17], res[ 72], res[ 77], ti[17], res[ 73], res[ 76]) \ + COMP_MUL(tr[18], res[ 66], res[ 71], 
ti[18], res[ 67], res[ 70]) \ + COMP_MUL(tr[19], res[ 74], res[ 79], ti[19], res[ 75], res[ 78]) \ + COMP_MUL(tr[20], res[ 80], res[ 85], ti[20], res[ 81], res[ 84]) \ + COMP_MUL(tr[21], res[ 88], res[ 93], ti[21], res[ 89], res[ 92]) \ + COMP_MUL(tr[22], res[ 82], res[ 87], ti[22], res[ 83], res[ 86]) \ + COMP_MUL(tr[23], res[ 90], res[ 95], ti[23], res[ 91], res[ 94]) \ + COMP_MUL(tr[24], res[ 96], res[101], ti[24], res[ 97], res[100]) \ + COMP_MUL(tr[25], res[104], res[109], ti[25], res[105], res[108]) \ + COMP_MUL(tr[26], res[ 98], res[103], ti[26], res[ 99], res[102]) \ + COMP_MUL(tr[27], res[106], res[111], ti[27], res[107], res[110]) \ + COMP_MUL(tr[28], res[112], res[117], ti[28], res[113], res[116]) \ + COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124]) \ + COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118]) \ + COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126]) \ + CO[ 0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i; \ + CO[ 1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i; \ + CO[ 2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i; \ + CO[ 3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i; \ + CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i; \ + CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i; \ + CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i; \ + CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i; \ + CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i; \ + CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i; \ + CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i; \ + CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i; \ + CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i; \ + CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i; \ + CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i; \ + CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i; \ + CO[ 4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; \ + CO[ 5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; \ + CO[ 6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; \ + CO[ 7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; \ + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; \ + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; \ + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; \ + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; \ + CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i; \ + CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i; \ + CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i; \ + CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i; \ + CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i; \ + CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i; \ + CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i; \ + CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i; \ + CO[ 8] A_OP tr[16] * alpha_r - ti[16] * alpha_i; \ + CO[ 9] A_OP ti[16] * alpha_r + tr[16] * alpha_i; \ + CO[ 10] A_OP tr[17] * alpha_r - ti[17] * alpha_i; \ + CO[ 11] A_OP ti[17] * alpha_r + tr[17] * alpha_i; \ + CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i; \ + CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i; \ + CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i; \ + CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i; \ + CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i; \ + CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i; \ + CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i; \ + CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i; \ + CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i; \ + CO[6*ldc+ 9] A_OP ti[22] * alpha_r + 
tr[22] * alpha_i; \ + CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i; \ + CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i; \ + CO[ 12] A_OP tr[24] * alpha_r - ti[24] * alpha_i; \ + CO[ 13] A_OP ti[24] * alpha_r + tr[24] * alpha_i; \ + CO[ 14] A_OP tr[25] * alpha_r - ti[25] * alpha_i; \ + CO[ 15] A_OP ti[25] * alpha_r + tr[25] * alpha_i; \ + CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i; \ + CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i; \ + CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i; \ + CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i; \ + CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i; \ + CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i; \ + CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i; \ + CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i; \ + CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i; \ + CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i; \ + CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i; \ + CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; + +#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4]) \ + COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12]) \ + COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6]) \ + COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14]) \ + COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20]) \ + COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28]) \ + COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22]) \ + COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_14 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4]) \ + COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6]) \ + COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \ + COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \ + COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \ + COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \ + COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \ + COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \ + COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \ + COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \ + COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \ + COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \ + COMP_MAC(tr[0], res[ 96], res[101], ti[0], 
res[ 97], res[100]) \ + COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \ + COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \ + COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[64], ti[64]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 2); i1++) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 3; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2); + } + SAVE_ACC_COMPLEX_24_ALL + CO += 16; + AO += temp << 
4; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4) + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0) + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8]; + vec_t 
rowB4 = *(vec_t *) & BO[(l<<3)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<3]; + vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_14 + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 3; + } + + if (n & 2) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 2; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_22_4 + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + 
__builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_22_2 + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! 
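+      /* Despite the note above, this m == 1 path still drives the MMA
+         accumulators.  The operands stay interleaved as (real, imag)
+         pairs, so each xvf32gerpp rank-1 update accumulates all four
+         scalar products ar*br, ar*bi, ai*br and ai*bi; the
+         SAVE_ACC_COMPLEX_* macros then recombine them through
+         COMP_MUL / COMP_MAC as re = ar*br - ai*bi, im = ar*bi + ai*br,
+         with the signs flipped as required by the conjugation variant
+         selected in the #if blocks at the top of this file. */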
+ SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_12 + AO += temp<<1; + BO += temp<<2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + + if (n & 1) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc << 1; + + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<4]; + vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + AO += temp << 4; + BO += temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowA3 = *(vec_t 
*) & AO[(l<<3)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<3]; + vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4]; + vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8]; + vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12]; + vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16]; + vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20]; + vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24]; + vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<2]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! 
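+ /* Same pattern as the wider row blocks: the main loop below walks k in
+  * steps of 8 and spreads __builtin_mma_xvf32gerpp rank-1 updates over
+  * acc0..acc7 (each call accumulates the 4x4 outer product of two
+  * float32x4 vectors into one accumulator); the second loop drains the
+  * k % 8 remainder into acc0 alone, and SAVE_ACC_COMPLEX_11 then folds
+  * all eight accumulators into the single complex result. */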
+ SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2]; + vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4]; + vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6]; + vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8]; + vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10]; + vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12]; + vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + vec_t rowA1 = *(vec_t *) & AO[l<<1]; + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp<<1; + BO += temp<<1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c new file mode 100644 index 0000000000..e4e609067c --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.c @@ -0,0 +1,761 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#define SET_ACC_ZERO() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = _arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += _arbi + _aibr; } +#endif + +#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = -_arbi + _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; } +#endif + +#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr + _aibi; _imag = _arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += _arbi - _aibr; } +#endif + +#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) +#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real = _arbr - _aibi; _imag = -_arbi - _aibr; } +#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; } +#endif + +#if defined(TRMMKERNEL) +#define A_OP = +#else +#define A_OP += +#endif + +#define BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)&result[4], &acc1); \ + __builtin_mma_disassemble_acc ((void *)&result[8], &acc2); \ + __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \ + __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \ + __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \ + __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \ + __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); + +#define SAVE_ACC_COMPLEX_11 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + +#define SAVE_ACC_COMPLEX_12 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[56], 
res[59], ti[1], res[57], res[58]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_1 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10]) \ + COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26]) \ + COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42]) \ + COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58]) \ + COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_2 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18]) \ + COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22]) \ + COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26]) \ + COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50]) \ + COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54]) \ + COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58]) \ + COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62]) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_21_4 \ + BUILTIN_MMA_DISASSEMBLE_ACC_8 \ + COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2]) \ + COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6]) \ + COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18]) \ + COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22]) \ + COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26]) \ + COMP_MUL(tr[7], res[28], res[31], ti[7], 
res[29], res[30]) \ + COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34]) \ + COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38]) \ + COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42]) \ + COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46]) \ + COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50]) \ + COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54]) \ + COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58]) \ + COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62]) \ + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; \ + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; \ + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; \ + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; \ + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; \ + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; \ + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; \ + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; \ + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_1 \ + __builtin_mma_disassemble_acc ((void *)result, &acc0); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] ) \ + CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI) \ + __builtin_mma_disassemble_acc ((void *)result, ACC1); \ + __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \ + COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2]) \ + COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6]) \ + COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10]) \ + COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14]) \ + CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; \ + CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; \ + CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; \ + CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; \ + CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i; \ + CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i; \ + CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i; \ + CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define 
REFRESH_POINTERS(x, y) \ + AO += off * (2*x); \ + BO = B + off * (2*y); \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * (2*x); \ + BO += temp * (2*y); +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i1, i, l, temp; + FLOAT *AO, *BO, *CO; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + + v4sf_t result[32]; + FLOAT *res, tr[16], ti[16]; + res = (FLOAT *) result; + + for (i1 = 0; i1 < (n >> 1); i1++) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<2; + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2); + } + __builtin_mma_disassemble_acc ((void *)result, &acc0); + __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1); + __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2); + __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3); + __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4); + __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5); + __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6); + __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7); + COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2]) + COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6]) + COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10]) + COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14]) + COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18]) + COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22]) + COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26]) + COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30]) + COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34]) + COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38]) + 
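+ /* each disassembled accumulator contributes groups of four partial
+  * products {ar*br, ar*bi, ai*br, ai*bi}; COMP_MUL/COMP_MAC fold one
+  * group into (tr, ti) with the sign pattern of the conjugation case
+  * selected by the macros above */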
COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42]) + COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46]) + COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50]) + COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54]) + COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58]) + COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62]) + CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i; + CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i; + CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i; + CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i; + CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i; + CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i; + CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i; + CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i; + CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i; + CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i; + CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i; + CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i; + CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i; + CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i; + CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i; + CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i; + CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i; + CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i; + CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i; + CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i; + CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i; + CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i; + CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i; + CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i; + CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i; + CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i; + CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i; + CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i; + CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i; + CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i; + CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i; + CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i; + + AO += temp << 4; + BO += temp << 2; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + } + for (l = (temp & (~1)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + 
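+ /* rowA1/rowA2 are __vector_pair loads (four doubles = two complex
+  * elements of A); each xvf64gerpp accumulates their outer product with
+  * one complex element of B (a two-double vec_t) into a 4x2 double
+  * accumulator */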
__builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2); + } + SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0) + SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4) + AO += temp << 3; + BO += temp << 2; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_22_1 + AO += temp << 2; + BO += temp << 2; + CO += 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! 
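+ /* m == 1 leaves only one complex element of A per k iteration, so the
+  * __vector_pair loads below overlap by one element (each pair spans two
+  * consecutive k values); only the upper half of acc0/acc1 -- the terms
+  * where A(k) meets B(k) -- is read back by SAVE_ACC_COMPLEX_12, and the
+  * mismatched lower halves are simply discarded. */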
+ SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4); + __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5); + __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6); + __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7); + __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<2]; + vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2); + } + SAVE_ACC_COMPLEX_12 + AO += temp << 1; + BO += temp << 2; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 2; + } + if (n & 1) + { +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + AO = A; + CO = C; + C += ldc<<1; + for (i = 0; i < (m >> 3); i++) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~1)); l+=2) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2); + __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2); + } + for (l = (temp & (~1)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1); + } + SAVE_ACC_COMPLEX_21_4 + + AO += temp << 4; + BO += 
temp << 1; + CO += 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~3)); l+=4) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4); + } + for (l = (temp & (~3)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); + } + SAVE_ACC_COMPLEX_21_2 + AO += temp << 3; + BO += temp << 1; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } if (m & 2) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_21_1 + AO += temp << 2; + BO += temp << 1; + CO += 
4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + if (m & 1) + { +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + // RIP OUT MMA STUFF! + SET_ACC_ZERO() + for (l = 0; l < (temp & (~7)); l+=8) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; + vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; + vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6]; + vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8]; + vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10]; + vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12]; + vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2); + __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3); + __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4); + __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5); + __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6); + __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7); + __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8); + } + for (l = (temp & (~7)); l < temp; ++l) + { + __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + vec_t rowB1 = *(vec_t *) & BO[l<<1]; + __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); + } + SAVE_ACC_COMPLEX_11 + AO += temp << 1; + BO += temp << 1; + CO += 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k << 1; + } + return 0; +} From 9762464718183fe3d73e42c83801596111a0df77 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 9 Oct 2024 18:06:39 +0200 Subject: [PATCH 087/244] Fix CBLAS interface filling in the wrong triangle for Row-Major --- interface/gemmt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/gemmt.c b/interface/gemmt.c index cae00877ed..01747af41b 100644 --- a/interface/gemmt.c +++ b/interface/gemmt.c @@ -319,8 +319,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, lda = LDB; ldb = LDA; - if (Uplo == CblasUpper) uplo = 0; - if (Uplo == CblasLower) uplo = 1; + if (Uplo == CblasUpper) uplo = 1; + if (Uplo == CblasLower) uplo = 0; if (TransB == CblasNoTrans) transa = 0; From b89fb9632f73b9e01ec09631763bfb9b5cb65b51 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Oct 2024 10:19:11 +0200 Subject: [PATCH 088/244] Update Android NDK install path for M1/armv7 crossbuild --- .cirrus.yml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 17e4eb7e87..c253c7555f 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -94,16 +94,8 @@ task: name: AppleM1/LLVM armv7-androidndk xbuild compile_script: - brew install --cask android-ndk - - export #PATH=/opt/homebrew/opt/llvm/bin:$PATH - - export #LDFLAGS="-L/opt/homebrew/opt/llvm/lib" - - export #CPPFLAGS="-I/opt/homebrew/opt/llvm/include" - - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" - - ls /opt/homebrew - - ls -l /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk - - 
find /opt/homebrew -name "armv7a-linux-androideabi*-ranlib" - - #export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang - - #export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0" - - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/27/AndroidNDK*.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang + - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"" + - export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: config_artifacts: From e4bc5e4718ef355e711b002750964eb48e4d43f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Oct 2024 11:02:56 +0200 Subject: [PATCH 089/244] remove stray quote --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index c253c7555f..112afe352c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -94,7 +94,7 @@ task: name: AppleM1/LLVM armv7-androidndk xbuild compile_script: - brew install --cask android-ndk - - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk"" + - export ANDROID_NDK_HOME="/opt/homebrew/share/android-ndk" - export CC=/opt/homebrew/share/android-ndk/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l" always: From 550bc77832282e4f03c5cfc1b7e7bbb089bc6c26 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 10 Oct 2024 20:39:29 +0200 Subject: [PATCH 090/244] Fix expectation values for CblasRowMajor order --- utest/test_extensions/test_cgemmt.c | 22 ++++++++++++++++++++++ utest/test_extensions/test_dgemmt.c | 15 +++++++++++++++ utest/test_extensions/test_sgemmt.c | 15 +++++++++++++++ utest/test_extensions/test_zgemmt.c | 23 ++++++++++++++++++++++- 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/utest/test_extensions/test_cgemmt.c b/utest/test_extensions/test_cgemmt.c index dfeb06ff6e..92590b1e8b 100644 --- a/utest/test_extensions/test_cgemmt.c +++ b/utest/test_extensions/test_cgemmt.c @@ -81,6 +81,28 @@ static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra ldc *= 2; +#ifndef NO_CBLAS + if (order == CblasRowMajor) { + if (uplo == 'U' || uplo == CblasUpper) + { + for (i = 0; i < m; i++) + for (j = i * 2; j < m * 2; j+=2){ + data_cgemmt.c_verify[i * ldc + j] = + data_cgemmt.c_gemm[i * ldc + j]; + data_cgemmt.c_verify[i * ldc + j + 1] = + data_cgemmt.c_gemm[i * ldc + j + 1]; + } + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i * 2; j+=2){ + data_cgemmt.c_verify[i * ldc + j] = + data_cgemmt.c_gemm[i * ldc + j]; + data_cgemmt.c_verify[i * ldc + j + 1] = + data_cgemmt.c_gemm[i * ldc + j + 1]; + } + } + } else +#endif if (uplo == 'L' || uplo == CblasLower) { for (i = 0; i < m; i++) diff --git a/utest/test_extensions/test_dgemmt.c b/utest/test_extensions/test_dgemmt.c index fd8f5f6661..9ce8859cca 100644 --- a/utest/test_extensions/test_dgemmt.c +++ b/utest/test_extensions/test_dgemmt.c @@ -77,6 +77,21 @@ static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra else cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda, data_dgemmt.b_test, ldb, 
beta, data_dgemmt.c_gemm, ldc); + + if (order == CblasRowMajor) { + if (uplo == 'U' || uplo == CblasUpper) + { + for (i = 0; i < m; i++) + for (j = i; j < m; j++) + data_dgemmt.c_verify[i * ldc + j] = + data_dgemmt.c_gemm[i * ldc + j]; + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i; j++) + data_dgemmt.c_verify[i * ldc + j] = + data_dgemmt.c_gemm[i * ldc + j]; + } + }else #endif if (uplo == 'L' || uplo == CblasLower) diff --git a/utest/test_extensions/test_sgemmt.c b/utest/test_extensions/test_sgemmt.c index 177ce0d73b..f61fcf5fca 100644 --- a/utest/test_extensions/test_sgemmt.c +++ b/utest/test_extensions/test_sgemmt.c @@ -77,6 +77,21 @@ static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra else cblas_sgemm(order, transa, transb, m, m, k, alpha, data_sgemmt.a_test, lda, data_sgemmt.b_test, ldb, beta, data_sgemmt.c_gemm, ldc); + if (order == CblasRowMajor) { + if (uplo == 'U' || uplo == CblasUpper) + { + for (i = 0; i < m; i++) + for (j = i; j < m; j++) + data_sgemmt.c_verify[i * ldc + j] = + data_sgemmt.c_gemm[i * ldc + j]; + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i; j++) + data_sgemmt.c_verify[i * ldc + j] = + data_sgemmt.c_gemm[i * ldc + j]; + } + + } else #endif if (uplo == 'L' || uplo == CblasLower) diff --git a/utest/test_extensions/test_zgemmt.c b/utest/test_extensions/test_zgemmt.c index 34b8b61867..dcd70e9e36 100644 --- a/utest/test_extensions/test_zgemmt.c +++ b/utest/test_extensions/test_zgemmt.c @@ -80,7 +80,28 @@ static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char tra #endif ldc *= 2; - +#ifndef NO_CBLAS + if (order == CblasRowMajor) { + if (uplo == 'U' || uplo == CblasUpper) + { + for (i = 0; i < m; i++) + for (j = i * 2; j < m * 2; j+=2){ + data_zgemmt.c_verify[i * ldc + j] = + data_zgemmt.c_gemm[i * ldc + j]; + data_zgemmt.c_verify[i * ldc + j + 1] = + data_zgemmt.c_gemm[i * ldc + j + 1]; + } + } else { + for (i = 0; i < m; i++) + for (j = 0; j <= i * 2; j+=2){ + data_zgemmt.c_verify[i * ldc + j] = + data_zgemmt.c_gemm[i * ldc + j]; + data_zgemmt.c_verify[i * ldc + j + 1] = + data_zgemmt.c_gemm[i * ldc + j + 1]; + } + } + }else +#endif if (uplo == 'L' || uplo == CblasLower) { for (i = 0; i < m; i++) From 1d51ca579857b05a928336b4e9e961868182940b Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 11 Oct 2024 16:08:48 -0500 Subject: [PATCH 091/244] Change multi-threading logic for SBGEMV to be the same as SGEMV. --- interface/sbgemv.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/interface/sbgemv.c b/interface/sbgemv.c index 89debe82da..fce86f8e46 100644 --- a/interface/sbgemv.c +++ b/interface/sbgemv.c @@ -178,21 +178,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasi if (incy < 0) {y -= (leny - 1) * incy;} #ifdef SMP - int thread_thres_row = 20480; - if (trans) { - if (n <= thread_thres_row) { - nthreads = 1; - } else { - nthreads = num_cpu_avail(1); - } - } else { - if (m <= thread_thres_row) { - nthreads = 1; - } else { - nthreads = num_cpu_avail(1); - } - } - + if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD ) + nthreads = 1; + else + nthreads = num_cpu_avail(2); if (nthreads == 1) { #endif From f8e113f27b3a10911fe6b382148aeb846d4ade08 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Sun, 13 Oct 2024 10:55:03 -0500 Subject: [PATCH 092/244] Replace types with include file. 
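Clang builds previously had to spell out the fixed-width integer names by
hand; including the standard header provides the same names on every
compiler. A minimal sketch of the idea, assuming the header being added is
<stdint.h>:

    #include <stdint.h>  /* assumed header: defines uint16_t/uint32_t/uint64_t */

    /* replaces the clang-only fallback this patch removes:
     *   #ifdef __clang__
     *   #define uint16_t unsigned short
     *   #define uint32_t unsigned int
     *   #define uint64_t unsigned long long
     *   #endif
     */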
--- kernel/power/gemm_common.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c index ed00de95b0..88fa52de53 100644 --- a/kernel/power/gemm_common.c +++ b/kernel/power/gemm_common.c @@ -3,17 +3,12 @@ #include "common.h" #include +#include #define NBMAX 4096 #define FORCEINLINE inline __attribute__((always_inline)) -#ifdef __clang__ -#define uint16_t unsigned short -#define uint32_t unsigned int -#define uint64_t unsigned long long -#endif - #ifdef _ARCH_PWR10 #ifdef __has_builtin #if !__has_builtin(__builtin_vsx_assemble_pair) From 36bd3eeddfe2b21353789da39e67bc9523e22d5a Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Sun, 13 Oct 2024 13:46:11 -0500 Subject: [PATCH 093/244] Vectorize BF16 GEMV (VSX & MMA). Use GEMM_GEMV_FORWARD_BF16 (for Power). --- Makefile.system | 6 +- cmake/system.cmake | 3 + interface/gemm.c | 2 +- kernel/power/KERNEL.POWER10 | 2 + kernel/power/KERNEL.POWER8 | 2 + kernel/power/KERNEL.POWER9 | 2 + kernel/power/gemm_common.c | 153 +++++++ kernel/power/sbgemv_common.c | 223 ++++++++++ kernel/power/sbgemv_common_power10.c | 629 +++++++++++++++++++++++++++ kernel/power/sbgemv_n.c | 152 +++++++ kernel/power/sbgemv_n_power10.c | 474 ++++++++++++++++++++ kernel/power/sbgemv_n_vsx.c | 299 +++++++++++++ kernel/power/sbgemv_t.c | 137 ++++++ kernel/power/sbgemv_t_power10.c | 338 ++++++++++++++ kernel/power/sbgemv_t_vsx.c | 292 +++++++++++++ test/compare_sgemm_sbgemm.c | 30 +- 16 files changed, 2728 insertions(+), 16 deletions(-) create mode 100644 kernel/power/gemm_common.c create mode 100644 kernel/power/sbgemv_common.c create mode 100644 kernel/power/sbgemv_common_power10.c create mode 100644 kernel/power/sbgemv_n.c create mode 100644 kernel/power/sbgemv_n_power10.c create mode 100644 kernel/power/sbgemv_n_vsx.c create mode 100644 kernel/power/sbgemv_t.c create mode 100644 kernel/power/sbgemv_t_power10.c create mode 100644 kernel/power/sbgemv_t_vsx.c diff --git a/Makefile.system b/Makefile.system index 7bae728552..8351b8efb2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -282,15 +282,19 @@ GEMM_GEMV_FORWARD = 1 endif ifeq ($(ARCH), power) GEMM_GEMV_FORWARD = 1 +GEMM_GEMV_FORWARD_BF16 = 1 endif ifeq ($(SMALL_MATRIX_OPT), 1) CCOMMON_OPT += -DSMALL_MATRIX_OPT endif -ifeq ($(GEMM_GEMV_FORWARD), 1) ifneq ($(ONLY_CBLAS), 1) +ifeq ($(GEMM_GEMV_FORWARD), 1) CCOMMON_OPT += -DGEMM_GEMV_FORWARD endif +ifeq ($(GEMM_GEMV_FORWARD_BF16), 1) +CCOMMON_OPT += -DGEMM_GEMV_FORWARD_BF16 +endif endif # This operation is expensive, so execution should be once. 
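The GEMM_GEMV_FORWARD_BF16 switch extends the existing GEMM-to-GEMV
forwarding to bfloat16: a GEMM whose output has one dimension of size 1 is
really a matrix-vector product, so the interface can dispatch to the newly
vectorized SBGEMV kernels instead. A rough, self-contained sketch of the
shape test guarded by this flag (function names here are hypothetical, not
the actual OpenBLAS entry points; the m == 1 case is assumed symmetric to
the n == 1 case visible in the interface/gemm.c hunk below):

    #include <stddef.h>

    typedef void (*gemv_fn)(size_t rows, size_t cols);  /* hypothetical */

    static int forward_gemm_to_gemv(size_t m, size_t n, size_t k,
                                    gemv_fn gemv_n, gemv_fn gemv_t)
    {
        if (k == 0) return 0;                    /* nothing to compute */
        if (n == 1) { gemv_n(m, k); return 1; }  /* C is m x 1 */
        if (m == 1) { gemv_t(k, n); return 1; }  /* C is 1 x n */
        return 0;          /* general shape: fall through to GEMM kernels */
    }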
diff --git a/cmake/system.cmake b/cmake/system.cmake index d49d53449a..6b891ca0ef 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -398,6 +398,9 @@ endif () if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS) set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD") endif () +if (GEMM_GEMV_FORWARD_BF16 AND NOT ONLY_CBLAS) + set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD_BF16") +endif () if (SMALL_MATRIX_OPT) set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") endif () diff --git a/interface/gemm.c b/interface/gemm.c index c030947b6f..5742d36c4b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -498,7 +498,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS args.m, args.n, args.k, args.lda, args.ldb, args.ldc); #endif -#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16) +#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) // Check if we can convert GEMM -> GEMV if (args.k != 0) { if (args.n == 1) { diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 4d17944ae7..c009e33cf4 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -236,11 +236,13 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c +SBGEMVNKERNEL = sbgemv_n_power10.c DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_power10.c # SGEMVTKERNEL = sgemv_t.c +SBGEMVTKERNEL = sbgemv_t_power10.c DGEMVTKERNEL = dgemv_t_power10.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 700a68e447..001401d532 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -257,11 +257,13 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c +SBGEMVNKERNEL = sbgemv_n_vsx.c DGEMVNKERNEL = dgemv_n.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # SGEMVTKERNEL = sgemv_t.c +SBGEMVTKERNEL = sbgemv_t_vsx.c DGEMVTKERNEL = dgemv_t.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 7d007d1a2b..a18c31a2e9 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -181,11 +181,13 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c +SBGEMVNKERNEL = sbgemv_n_vsx.c DGEMVNKERNEL = dgemv_n.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # SGEMVTKERNEL = sgemv_t.c +SBGEMVTKERNEL = sbgemv_t_vsx.c DGEMVTKERNEL = dgemv_t.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c new file mode 100644 index 0000000000..88fa52de53 --- /dev/null +++ b/kernel/power/gemm_common.c @@ -0,0 +1,153 @@ +#ifndef GEMM_COMMON_C +#define GEMM_COMMON_C +#include "common.h" + +#include +#include + +#define NBMAX 4096 + +#define FORCEINLINE inline __attribute__((always_inline)) + +#ifdef _ARCH_PWR10 +#ifdef __has_builtin +#if !__has_builtin(__builtin_vsx_assemble_pair) +#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair +#endif +#if !__has_builtin(__builtin_vsx_disassemble_pair) +#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair +#endif +#endif + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v1, v0) +#else +#define __builtin_vsx_assemble_pair2(vp0, v0, v1) __builtin_vsx_assemble_pair(vp0, v0, v1) +#endif + +#define USE_VECTOR_PAIRS +#endif + +typedef __vector IFLOAT vec_bf16; 
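+/* when built for bfloat16 (IFLOAT = bfloat16), a vec_bf16 carries eight
+   input elements per 128-bit VSX register; vec_f32 below views the same
+   registers as four floats, and vec_uc8 is the raw byte form that the
+   vec_xl and MMA builtins operate on */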
+typedef __vector FLOAT vec_f32; +typedef __vector unsigned char vec_uc8; + +FORCEINLINE vec_uc8 vec_load_vec(void *src) +{ + return vec_xl(0, (unsigned char *)(src)); +} + +FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src) +{ +#ifdef USE_VECTOR_PAIRS + __vector_pair vy0p; +#ifdef __clang__ + vy0p = __builtin_vsx_lxvp(0L, (const __vector_pair *)(src)); +#else + vy0p = *(__vector_pair *)(src); +#endif + __builtin_vsx_disassemble_pair((void *)(dst), &vy0p); +#else + dst[0] = src[0]; + dst[1] = src[1]; +#endif +} + +FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src) +{ +#ifdef USE_VECTOR_PAIRS + __vector_pair vy0p; + __builtin_vsx_assemble_pair2(&vy0p, (vec_uc8)src[1], (vec_uc8)src[0]); +#ifdef __clang__ + __builtin_vsx_stxvp(vy0p, 0L, (__vector_pair *)(dst)); +#else + *(__vector_pair *)(dst) = vy0p; +#endif +#else + dst[0] = src[0]; + dst[1] = src[1]; +#endif +} + +FORCEINLINE vec_bf16 vec_loadN(void *src, BLASLONG n) +{ + IFLOAT *src2 = (IFLOAT *)(src); +#ifdef _ARCH_PWR9 + return vec_xl_len(src2, n * sizeof(IFLOAT)); +#else + __attribute__((aligned(16))) IFLOAT data[sizeof(vec_bf16) / sizeof(IFLOAT)]; + memset(data, 0, sizeof(vec_bf16)); + if (n & 4) { + memcpy(data, src2, sizeof(uint64_t)); + } + if (n & 2) { + BLASLONG n4 = n & 4; + memcpy(data + n4, src2 + n4, sizeof(uint32_t)); + } + if (n & 1) { + BLASLONG n6 = n & 6; + data[n6] = src2[n6]; + } + return (vec_bf16)vec_load_vec(data); +#endif +} + +FORCEINLINE vec_f32 vec_loadN_f32(void *src, BLASLONG n) +{ +#ifndef _ARCH_PWR9 + if (n & 4) { + return (vec_f32)vec_load_vec(src); + } +#endif + return (vec_f32)vec_loadN(src, n * (sizeof(FLOAT) / sizeof(IFLOAT))); +} + +FORCEINLINE void vec_loadN2_f32(vec_f32 *data, vec_f32 *src, BLASLONG n) +{ + data[0] = src[0]; + data[1] = vec_loadN_f32(&src[1], n); +} + +FORCEINLINE void vec_storeN(vec_bf16 data, void *dst, BLASLONG n) +{ + IFLOAT *dst2 = (IFLOAT *)(dst); +#ifdef _ARCH_PWR9 + vec_xst_len(data, dst2, n * sizeof(IFLOAT)); +#else + if (n & 8) { + vec_xst(data, 0, dst2); + return; + } + __attribute__((aligned(16))) IFLOAT data2[sizeof(vec_f32) / sizeof(IFLOAT)]; + vec_xst(data, 0, data2); + if (n & 4) { + memcpy(dst2, data2, sizeof(uint64_t)); + } + if (n & 2) { + BLASLONG n4 = n & 4; + memcpy(dst2 + n4, data2 + n4, sizeof(uint32_t)); + } + if (n & 1) { + BLASLONG n6 = n & 6; + dst2[n6] = data2[n6]; + } +#endif +} + +FORCEINLINE void vec_storeN_f32(vec_f32 data, void *dst, BLASLONG n) +{ +#ifndef _ARCH_PWR9 + if (n & 4) { + vec_xst(data, 0, (FLOAT *)dst); + return; + } +#endif + return vec_storeN((vec_bf16)data, dst, n * (sizeof(FLOAT) / sizeof(IFLOAT))); +} + +FORCEINLINE void vec_storeN2_f32(vec_f32 *data, vec_f32 *dst, BLASLONG n) +{ + dst[0] = data[0]; + vec_storeN_f32(data[1], &dst[1], n); +} +#endif diff --git a/kernel/power/sbgemv_common.c b/kernel/power/sbgemv_common.c new file mode 100644 index 0000000000..830481fef3 --- /dev/null +++ b/kernel/power/sbgemv_common.c @@ -0,0 +1,223 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#ifndef SBGEMV_COMMON_C +#define SBGEMV_COMMON_C +#include "gemm_common.c" + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define BF16_HI(data, zero) (vec_f32)vec_mergeh(data, zero) +#define BF16_LO(data, zero) (vec_f32)vec_mergel(data, zero) +#else +#define BF16_HI(data, zero) (vec_f32)vec_mergeh(zero, data) +#define BF16_LO(data, zero) (vec_f32)vec_mergel(zero, data) +#endif + +FORCEINLINE vec_f32 vec_loadNHi(void *src, BLASLONG n, vec_bf16 zero) +{ + vec_bf16 data = vec_loadN(src, n); + return BF16_HI(data, zero); +} + +FORCEINLINE vec_f32 vec_mult(vec_f32 *inp, vec_bf16 in0, vec_bf16 zero) +{ + vec_f32 v_in00 = BF16_HI(in0, zero); + vec_f32 v_in01 = BF16_LO(in0, zero); + + return (inp[0] * v_in00) + (inp[1] * v_in01); +} + +FORCEINLINE vec_f32 vec_load_mult(vec_bf16 *in, vec_f32 *inp, vec_bf16 zero) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(in); + + return vec_mult(inp, in0, zero); +} + +FORCEINLINE void vec_load_vec2(vec_bf16 *in, vec_f32 *v_x0, vec_bf16 zero) +{ + vec_bf16 inp = (vec_bf16)vec_load_vec(in); + + v_x0[0] = BF16_HI(inp, zero); + v_x0[1] = BF16_LO(inp, zero); +} + +FORCEINLINE void vec_mult2(vec_f32 v_x0, vec_bf16 in0, vec_bf16 zero, vec_f32 *vy0) +{ + vec_f32 v_in00 = BF16_HI(in0, zero); + vec_f32 v_in01 = BF16_LO(in0, zero); + + vy0[0] += (v_x0 * v_in00); + vy0[1] += (v_x0 * v_in01); +} + +FORCEINLINE void vec_load_mult2(vec_f32 v_x0, vec_bf16 *in, vec_bf16 zero, vec_f32 *vy0) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(in); + + vec_mult2(v_x0, in0, zero, vy0); +} + +FORCEINLINE vec_f32 vec_loadN_mult(vec_bf16 *in, vec_f32 *inp, BLASLONG n, vec_bf16 zero) +{ + vec_bf16 in0 = vec_loadN(in, n); + + return vec_mult(inp, in0, zero); +} + +FORCEINLINE void vec_loadN_vec2(vec_bf16 *in, vec_f32 *v_x0, BLASLONG n, vec_bf16 zero) +{ + vec_bf16 inp = vec_loadN(in, n); + + v_x0[0] = BF16_HI(inp, zero); + v_x0[1] = BF16_LO(inp, zero); +} + +FORCEINLINE void vec_loadN_mult2(vec_f32 v_x0, vec_bf16 *in, BLASLONG n, vec_bf16 zero, vec_f32 *vy0) +{ + vec_bf16 in0 = vec_loadN(in, n); + + vec_mult2(v_x0, in0, zero, vy0); +} + +FORCEINLINE vec_f32 vec_loadNHi_mult(vec_bf16 *in, vec_f32 v_inp0, BLASLONG n, vec_bf16 zero) +{ + vec_f32 v_in00 = vec_loadNHi(in, n, zero); + + return (v_inp0 * v_in00); +} + +FORCEINLINE void copy_x(BLASLONG n, IFLOAT *src, IFLOAT *dest, 
BLASLONG inc_src) +{ + for (BLASLONG i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +FORCEINLINE void copy_y_beta(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) +{ + if (beta == (FLOAT)0) { + memset(dest, 0, n * sizeof(FLOAT)); + } else if (beta == (FLOAT)1) { + for (BLASLONG i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } + } else { + for (BLASLONG i = 0; i < n; i++) { + *dest++ = *src * beta; + src += inc_src; + } + } +} + +FORCEINLINE void move_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + for (BLASLONG i = 0; i < n; i++) { + *dest = *src++; + dest += inc_dest; + } +} + +FORCEINLINE void copy_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src, FLOAT beta) +{ + if (beta == (FLOAT)0) { + move_y(n, src, dest, inc_src); + } else if (beta == (FLOAT)1) { + for (BLASLONG i = 0; i < n; i++) { + *dest += *src++; + dest += inc_src; + } + } else { + for (BLASLONG i = 0; i < n; i++) { + *dest = *src++ + (beta * *dest); + dest += inc_src; + } + } +} + +static void BF16GEMV_N_beta(BLASLONG n, FLOAT *output_vector, FLOAT *input_vector, FLOAT beta) +{ + if (beta == (FLOAT)0) { + memset(output_vector, 0, sizeof(FLOAT) * n); + } else if (beta == (FLOAT)1) { + if (output_vector != input_vector) { + memcpy(output_vector, input_vector, sizeof(FLOAT) * n); + } + } else { + vec_f32 b = { beta, beta, beta, beta }; + + vec_f32 *in = (vec_f32 *)input_vector; + vec_f32 *out = (vec_f32 *)output_vector; + + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 v_inp0[2]; + + for (; i + 4 <= n8; i += 4) { + vec_f32 v_inp1[2], v_inp2[2], v_inp3[2]; + vec_load_pair(v_inp0, &in[(i * 2) + 0]); + vec_load_pair(v_inp1, &in[(i * 2) + 2]); + vec_load_pair(v_inp2, &in[(i * 2) + 4]); + vec_load_pair(v_inp3, &in[(i * 2) + 6]); + v_inp0[0] *= b; + v_inp0[1] *= b; + v_inp1[0] *= b; + v_inp1[1] *= b; + v_inp2[0] *= b; + v_inp2[1] *= b; + v_inp3[0] *= b; + v_inp3[1] *= b; + vec_store_pair(&out[(i * 2) + 0], v_inp0); + vec_store_pair(&out[(i * 2) + 2], v_inp1); + vec_store_pair(&out[(i * 2) + 4], v_inp2); + vec_store_pair(&out[(i * 2) + 6], v_inp3); + } + + for (; i < n8; i++) { + vec_load_pair(v_inp0, &in[(i * 2) + 0]); + v_inp0[0] *= b; + v_inp0[1] *= b; + vec_store_pair(&out[(i * 2) + 0], v_inp0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(v_inp0, &in[(i * 2) + 0], n3); + v_inp0[0] *= b; + v_inp0[1] *= b; + vec_storeN2_f32(v_inp0, &out[(i * 2) + 0], n3); + } else if (n) { + v_inp0[0] = vec_loadN_f32(&in[(i * 2) + 0], n); + v_inp0[0] *= b; + vec_storeN_f32(v_inp0[0], &out[(i * 2) + 0], n); + } + } +} +#endif diff --git a/kernel/power/sbgemv_common_power10.c b/kernel/power/sbgemv_common_power10.c new file mode 100644 index 0000000000..b0e611cb68 --- /dev/null +++ b/kernel/power/sbgemv_common_power10.c @@ -0,0 +1,629 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#ifndef SBGEMV_COMMON_MMA_C +#define SBGEMV_COMMON_MMA_C +#include "sbgemv_common.c" + +#if defined(_AIX) || defined(__clang__) +#define USE_MERGE_MMA +#endif + +FORCEINLINE void vec_load_pair2(vec_bf16 *in0, vec_bf16 *in) +{ + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(in + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(in + 2)); +} + +FORCEINLINE void vec_load_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(in); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0, (vec_uc8)inp); +} + +FORCEINLINE void vec_load_mult12a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp) +{ + vec_bf16 in11 = (vec_bf16)vec_load_vec(in1); + + vec_load_mult_mma(out, in0, inp); + + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); +} + +FORCEINLINE void vec_load_mult14_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 inp) +{ + vec_bf16 in21 = (vec_bf16)vec_load_vec(in2); + vec_bf16 in31 = (vec_bf16)vec_load_vec(in3); + + vec_load_mult12a_mma(out, in0, in1, inp); + + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31, (vec_uc8)inp); +} + +FORCEINLINE void vec_load_mult2_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_bf16 in0[2]; + + vec_load_pair((vec_f32 *)in0, (vec_f32 *)in); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[1], (vec_uc8)inp[1]); +} + +FORCEINLINE void vec_mult2d_mma(__vector_quad *out, vec_bf16 *in01, vec_bf16 *in11, vec_bf16 *inp) +{ + __builtin_mma_xvbf16ger2pp(out + 0, (vec_uc8)in01[0], (vec_uc8)inp[0]); + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11[0], (vec_uc8)inp[0]); +} + +FORCEINLINE void vec_load_mult22_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) +{ + vec_bf16 in01[2], in11[2]; + + vec_load_pair((vec_f32 *)in01, (vec_f32 *)in0); + vec_load_pair((vec_f32 *)in11, (vec_f32 *)in1); + + vec_mult2d_mma(out, in01 + 0, in11 + 0, inp + 0); + vec_mult2d_mma(out, in01 + 1, in11 + 1, inp + 1); +} + +FORCEINLINE void vec_load_mult24_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 *inp) +{ + vec_bf16 in01[2], in11[2], in21[2], in31[2]; + + vec_load_pair((vec_f32 *)in01, (vec_f32 *)in0); + vec_load_pair((vec_f32 *)in11, (vec_f32 *)in1); + vec_load_pair((vec_f32 *)in21, (vec_f32 *)in2); + vec_load_pair((vec_f32 *)in31, 
(vec_f32 *)in3); + + vec_mult2d_mma(out + 0, in01 + 0, in11 + 0, inp + 0); + vec_mult2d_mma(out + 2, in21 + 0, in31 + 0, inp + 0); + vec_mult2d_mma(out + 0, in01 + 1, in11 + 1, inp + 1); + vec_mult2d_mma(out + 2, in21 + 1, in31 + 1, inp + 1); +} + +FORCEINLINE void vec_load_mult4_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_bf16 in0[2]; + + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(in + 2)); + + vec_load_mult2_mma(out, in + 0, inp + 0); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[0], (vec_uc8)inp[2]); + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0[1], (vec_uc8)inp[3]); +} + +FORCEINLINE void vec_load_mult42_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) +{ + vec_bf16 in01[4], in11[4]; + + vec_load_pair2(in01, in0); + vec_load_pair2(in11, in1); + + vec_mult2d_mma(out, in01 + 0, in11 + 0, inp + 0); + vec_mult2d_mma(out, in01 + 1, in11 + 1, inp + 1); + vec_mult2d_mma(out, in01 + 2, in11 + 2, inp + 2); + vec_mult2d_mma(out, in01 + 3, in11 + 3, inp + 3); +} + +FORCEINLINE void vec_mult4d_mma(__vector_quad *out, vec_bf16 *in01, vec_bf16 *in11, vec_bf16 *in21, vec_bf16 *in31, vec_bf16 *inp) +{ + vec_mult2d_mma(out + 0, in01, in11, inp); + vec_mult2d_mma(out + 2, in21, in31, inp); +} + +FORCEINLINE void vec_load_mult44_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 *inp) +{ + vec_bf16 in01[4], in11[4], in21[4], in31[4]; + + vec_load_pair2(in01, in0); + vec_load_pair2(in11, in1); + vec_load_pair2(in21, in2); + vec_load_pair2(in31, in3); + + vec_mult4d_mma(out, in01 + 0, in11 + 0, in21 + 0, in31 + 0, inp + 0); + vec_mult4d_mma(out, in01 + 1, in11 + 1, in21 + 1, in31 + 1, inp + 1); + vec_mult4d_mma(out, in01 + 2, in11 + 2, in21 + 2, in31 + 2, inp + 2); + vec_mult4d_mma(out, in01 + 3, in11 + 3, in21 + 3, in31 + 3, inp + 3); +} + +FORCEINLINE void vec_loadN_mult_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(in, n); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)in0, (vec_uc8)inp); +} + +FORCEINLINE void vec_loadN_mult12a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in11 = (vec_bf16)vec_loadN(in1, n); + + vec_loadN_mult_mma(out, in0, inp, n); + + __builtin_mma_xvbf16ger2pp(out + 1, (vec_uc8)in11, (vec_uc8)inp); +} + +FORCEINLINE void vec_loadN_mult14_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *in2, vec_bf16 *in3, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in21 = (vec_bf16)vec_loadN(in2, n); + vec_bf16 in31 = (vec_bf16)vec_loadN(in3, n); + + vec_loadN_mult12a_mma(out, in0, in1, inp, n); + + __builtin_mma_xvbf16ger2pp(out + 2, (vec_uc8)in21, (vec_uc8)inp); + __builtin_mma_xvbf16ger2pp(out + 3, (vec_uc8)in31, (vec_uc8)inp); +} + +FORCEINLINE void vec_mult1_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) +{ + vec_bf16 in00 = vec_mergeh(in0, in0); + + __builtin_mma_xvbf16ger2(out, (vec_uc8)inp, (vec_uc8)in00); +} + +FORCEINLINE void vec_mult2_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) +{ + vec_bf16 in01 = vec_mergel(in0, in0); + + vec_mult1_mma(&out[0], in0, inp); + + __builtin_mma_xvbf16ger2(&out[1], (vec_uc8)inp, (vec_uc8)in01); +} + +#ifndef USE_MERGE_MMA +FORCEINLINE void vec_mult4_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 inp) +{ + vec_mult2_mma(out + 0, in0[0], inp); + vec_mult2_mma(out + 2, in0[1], inp); +} +#endif + +FORCEINLINE void vec_loadN_mult11_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(in, n); + + vec_mult1_mma(out, in0, 
inp); +} + +FORCEINLINE void vec_loadN_mult12_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(in, n); + + vec_mult2_mma(out, in0, inp); +} + +FORCEINLINE void vec_load_mult12_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(in); + + vec_mult2_mma(out, in0, inp); +} + +#ifndef USE_MERGE_MMA +FORCEINLINE void vec_load_mult18_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 inp) +{ + vec_bf16 in0[4]; + + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(in + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(in + 2)); + + vec_mult4_mma(&out[0], in0 + 0, inp); + vec_mult4_mma(&out[4], in0 + 2, inp); +} +#endif + +FORCEINLINE void vec_reduce1_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + __builtin_mma_disassemble_acc((void*)temp, &out[0]); + + vy0[0] += (temp[0] * v_alpha); +} + +FORCEINLINE void vec_reduce2_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + vec_reduce1_mma(&out[0], &temp[0], v_alpha, &vy0[0]); + vec_reduce1_mma(&out[1], &temp[4], v_alpha, &vy0[1]); +} + +#ifndef USE_MERGE_MMA +FORCEINLINE void vec_reduce8_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + vec_reduce2_mma(&out[0], &temp[0], v_alpha, vy0 + 0); + vec_reduce2_mma(&out[2], &temp[8], v_alpha, vy0 + 2); + vec_reduce2_mma(&out[4], &temp[16], v_alpha, vy0 + 4); + vec_reduce2_mma(&out[6], &temp[24], v_alpha, vy0 + 6); +} +#else +FORCEINLINE void vec_reduce44_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + __builtin_mma_disassemble_acc((void*)temp, &out[0]); + + vy0[0] += (temp[0] * v_alpha); + vy0[2] += (temp[1] * v_alpha); + vy0[4] += (temp[2] * v_alpha); + vy0[6] += (temp[3] * v_alpha); +} + +FORCEINLINE void vec_reduce84_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + vec_reduce44_mma(&out[0], &temp[0], v_alpha, vy0 + 0); + vec_reduce44_mma(&out[1], &temp[4], v_alpha, vy0 + 1); +} + +FORCEINLINE void vec_reduce88_mma(__vector_quad *out, vec_f32 *temp, vec_f32 v_alpha, vec_f32 *vy0) +{ + vec_reduce44_mma(&out[0], &temp[ 0], v_alpha, vy0 + 0); + vec_reduce44_mma(&out[1], &temp[ 4], v_alpha, vy0 + 1); + vec_reduce44_mma(&out[2], &temp[ 8], v_alpha, vy0 + 8); + vec_reduce44_mma(&out[3], &temp[12], v_alpha, vy0 + 9); +} +#endif + +FORCEINLINE void vec_mult11a_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) +{ + vec_bf16 in00 = vec_mergeh(in0, in1); + + __builtin_mma_xvbf16ger2(out, (vec_uc8)inp, (vec_uc8)in00); +} + +FORCEINLINE void vec_mult2a_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) +{ + vec_bf16 in01 = vec_mergel(in0, in1); + + vec_mult11a_mma(&out[0], in0, in1, inp); + + __builtin_mma_xvbf16ger2(&out[1], (vec_uc8)inp, (vec_uc8)in01); +} + +FORCEINLINE void vec_mult4a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp) +{ + vec_mult2a_mma(out + 0, in0[0], in1[0], inp); + vec_mult2a_mma(out + 2, in0[1], in1[1], inp); +} + +FORCEINLINE void vec_loadN_mult11a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(ina, n); + vec_bf16 in1 = vec_loadN(inb, n); + + vec_mult11a_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load_mult22a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(ina); + vec_bf16 in1 = (vec_bf16)vec_load_vec(inb); + + vec_mult2a_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load4_mma(vec_bf16 *in0, 
vec_bf16 *in1, vec_bf16 *ina, vec_bf16 *inb) +{ + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(ina + 0)); + vec_load_pair((vec_f32 *)(in1 + 0), (vec_f32 *)(inb + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(ina + 2)); + vec_load_pair((vec_f32 *)(in1 + 2), (vec_f32 *)(inb + 2)); +} + +#ifndef USE_MERGE_MMA +FORCEINLINE void vec_load_mult28a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load4_mma(in0, in1, ina, inb); + + vec_mult4a_mma(&out[0], in0 + 0, in1 + 0, inp); + vec_mult4a_mma(&out[4], in0 + 2, in1 + 2, inp); +} +#endif + +FORCEINLINE void vec_loadN_mult22a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(ina, n); + vec_bf16 in1 = vec_loadN(inb, n); + + vec_mult2a_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_mult11b_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) +{ + vec_bf16 in00 = vec_mergeh(in0, in1); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)inp, (vec_uc8)in00); +} + +FORCEINLINE void vec_mult2b_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 in1, vec_bf16 inp) +{ + vec_bf16 in01 = vec_mergel(in0, in1); + + vec_mult11b_mma(&out[0], in0, in1, inp); + + __builtin_mma_xvbf16ger2pp(&out[1], (vec_uc8)inp, (vec_uc8)in01); +} + +FORCEINLINE void vec_mult4b_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 inp) +{ + vec_mult2b_mma(out + 0, in0[0], in1[0], inp); + vec_mult2b_mma(out + 2, in0[1], in1[1], inp); +} + +#ifdef USE_MERGE_MMA +FORCEINLINE void vec_mult1c_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) +{ + vec_bf16 in00 = vec_mergeh(in0, in0); + + __builtin_mma_xvbf16ger2pp(out, (vec_uc8)inp, (vec_uc8)in00); +} + +FORCEINLINE void vec_mult2c_mma(__vector_quad *out, vec_bf16 in0, vec_bf16 inp) +{ + vec_bf16 in01 = vec_mergel(in0, in0); + + vec_mult1c_mma(&out[0], in0, inp); + + __builtin_mma_xvbf16ger2pp(&out[1], (vec_uc8)inp, (vec_uc8)in01); +} + +FORCEINLINE void vec_mult44_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_mult2_mma(out, in[0], inp[0]); + vec_mult2c_mma(out, in[1], inp[1]); +} + +FORCEINLINE void vec_mult44c_mma(__vector_quad *out, vec_bf16 *in, vec_bf16 *inp) +{ + vec_mult2c_mma(out, in[0], inp[0]); + vec_mult2c_mma(out, in[1], inp[1]); +} + +FORCEINLINE void vec_mult44a_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) +{ + vec_mult2a_mma(out, in0[0], in1[0], inp[0]); + vec_mult2b_mma(out, in0[1], in1[1], inp[1]); +} + +FORCEINLINE void vec_mult44b_mma(__vector_quad *out, vec_bf16 *in0, vec_bf16 *in1, vec_bf16 *inp) +{ + vec_mult2b_mma(out, in0[0], in1[0], inp[0]); + vec_mult2b_mma(out, in0[1], in1[1], inp[1]); +} +#endif + +FORCEINLINE void vec_loadN_mult11b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(ina, n); + vec_bf16 in1 = vec_loadN(inb, n); + + vec_mult11b_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load_mult22b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0 = (vec_bf16)vec_load_vec(ina); + vec_bf16 in1 = (vec_bf16)vec_load_vec(inb); + + vec_mult2b_mma(out, in0, in1, inp); +} + +#ifndef USE_MERGE_MMA +FORCEINLINE void vec_load_mult28b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load4_mma(in0, in1, ina, inb); + + vec_mult4b_mma(&out[0], in0 + 0, in1 + 0, inp); + vec_mult4b_mma(&out[4], in0 + 2, in1 + 2, inp); +} +#else +FORCEINLINE void vec_load_mult184_mma(__vector_quad *out, 
vec_bf16 *in, vec_bf16 *inp) +{ + vec_bf16 in0[4]; + + vec_load_pair((vec_f32 *)(in0 + 0), (vec_f32 *)(in + 0)); + vec_load_pair((vec_f32 *)(in0 + 2), (vec_f32 *)(in + 2)); + + vec_mult44_mma(out, in0 + 0, inp + 0); + vec_mult44c_mma(out, in0 + 2, inp + 2); +} + +FORCEINLINE void vec_load_mult284a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 *inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load4_mma(in0, in1, ina, inb); + + vec_mult44a_mma(out, in0 + 0, in1 + 0, inp + 0); + vec_mult44b_mma(out, in0 + 2, in1 + 2, inp + 2); +} + +FORCEINLINE void vec_load_mult284b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 *inp) +{ + vec_bf16 in0[4], in1[4]; + + vec_load4_mma(in0, in1, ina, inb); + + vec_mult44b_mma(out, in0 + 0, in1 + 0, inp + 0); + vec_mult44b_mma(out, in0 + 2, in1 + 2, inp + 2); +} + +FORCEINLINE void vec_load_mult288a_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 *inp) +{ + vec_bf16 in0[8], in1[8]; + + vec_load4_mma(in0 + 0, in1 + 0, ina + 0, inb + 0); + vec_load4_mma(in0 + 4, in1 + 4, ina + 4, inb + 4); + + vec_mult44a_mma(out + 0, in0 + 0, in1 + 0, inp + 0); + vec_mult44a_mma(out + 2, in0 + 4, in1 + 4, inp + 0); + vec_mult44b_mma(out + 0, in0 + 2, in1 + 2, inp + 2); + vec_mult44b_mma(out + 2, in0 + 6, in1 + 6, inp + 2); +} + +FORCEINLINE void vec_load_mult288b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 *inp) +{ + vec_bf16 in0[8], in1[8]; + + vec_load4_mma(in0 + 0, in1 + 0, ina + 0, inb + 0); + vec_load4_mma(in0 + 4, in1 + 4, ina + 4, inb + 4); + + vec_mult44b_mma(out + 0, in0 + 0, in1 + 0, inp + 0); + vec_mult44b_mma(out + 2, in0 + 4, in1 + 4, inp + 0); + vec_mult44b_mma(out + 0, in0 + 2, in1 + 2, inp + 2); + vec_mult44b_mma(out + 2, in0 + 6, in1 + 6, inp + 2); +} +#endif + +FORCEINLINE void vec_loadN_mult22b_mma(__vector_quad *out, vec_bf16 *ina, vec_bf16 *inb, vec_bf16 inp, BLASLONG n) +{ + vec_bf16 in0 = vec_loadN(ina, n); + vec_bf16 in1 = vec_loadN(inb, n); + + vec_mult2b_mma(out, in0, in1, inp); +} + +FORCEINLINE void vec_load4_pair(vec_f32 *vy0, vec_f32 *v_y) +{ + vec_load_pair(vy0 + 0, v_y + 0); + vec_load_pair(vy0 + 2, v_y + 2); + vec_load_pair(vy0 + 4, v_y + 4); + vec_load_pair(vy0 + 6, v_y + 6); +} + +FORCEINLINE void vec_store4_pair(vec_f32 *v_y, vec_f32 *vy0) +{ + vec_store_pair(v_y + 0, vy0 + 0); + vec_store_pair(v_y + 2, vy0 + 2); + vec_store_pair(v_y + 4, vy0 + 4); + vec_store_pair(v_y + 6, vy0 + 6); +} + +FORCEINLINE void vec_setzero_2(__vector_quad *temp0) +{ + __builtin_mma_xxsetaccz(&temp0[0]); + __builtin_mma_xxsetaccz(&temp0[1]); +} + +FORCEINLINE void vec_setzero_4(__vector_quad *temp0) +{ + vec_setzero_2(temp0 + 0); + vec_setzero_2(temp0 + 2); +} + +FORCEINLINE void vec_setzero_8(__vector_quad *temp0) +{ + vec_setzero_4(temp0 + 0); + vec_setzero_4(temp0 + 4); +} + +FORCEINLINE void vec_reduce_2(vec_f32 *temp00, __vector_quad *temp0) +{ + __builtin_mma_disassemble_acc((void*)(temp00 + 0), &temp0[0]); + __builtin_mma_disassemble_acc((void*)(temp00 + 4), &temp0[1]); +} + +FORCEINLINE void vec_reduce_4(vec_f32 *temp00, __vector_quad *temp0) +{ + vec_reduce_2(temp00 + 0, temp0 + 0); + vec_reduce_2(temp00 + 8, temp0 + 2); +} + +FORCEINLINE void vec_reduce_8(vec_f32 *temp00, __vector_quad *temp0) +{ + vec_reduce_4(temp00 + 0, temp0 + 0); + vec_reduce_4(temp00 + 16, temp0 + 4); +} + +#ifdef USE_MERGE_MMA +FORCEINLINE void vec_load8_pair(vec_f32 *vy0, vec_f32 *v_y) +{ + vec_load4_pair(vy0 + 0, v_y + 0); + vec_load4_pair(vy0 + 8, v_y + 8); +} + +FORCEINLINE void vec_store8_pair(vec_f32 *v_y, vec_f32 
*vy0) +{ + vec_store4_pair(v_y + 0, vy0 + 0); + vec_store4_pair(v_y + 8, vy0 + 8); +} + +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define VEC_SHIFT(data, shift) vec_sldw(data, data, 4 - shift) + +#define MASK_0 0xf000 +#define MASK_1 0x0f00 +#define MASK_2 0x00f0 +#define MASK_3 0x000f +#else +#define VEC_SHIFT(data, shift) vec_sldw(data, data, shift) + +#define MASK_0 0x000f +#define MASK_1 0x00f0 +#define MASK_2 0x0f00 +#define MASK_3 0xf000 +#endif + +FORCEINLINE void vec_make_mult1(vec_bf16 *v_x0, const bool mask) +{ + if (mask) { + v_x0[ 0] = vec_and(v_x0[0], (vec_bf16)vec_genbm(MASK_0)); + } + + v_x0[ 1] = VEC_SHIFT(v_x0[ 0], 1); + v_x0[ 2] = VEC_SHIFT(v_x0[ 0], 2); + v_x0[ 3] = VEC_SHIFT(v_x0[ 0], 3); +} + +FORCEINLINE void vec_make_mult2(vec_bf16 *v_x0) +{ + v_x0[ 5] = vec_and(v_x0[0], (vec_bf16)vec_genbm(MASK_1)); + vec_make_mult1(v_x0, true); + + v_x0[ 4] = VEC_SHIFT(v_x0[ 5], 3); + v_x0[ 6] = VEC_SHIFT(v_x0[ 5], 1); + v_x0[ 7] = VEC_SHIFT(v_x0[ 5], 2); +} + +FORCEINLINE void vec_make_mult4(vec_bf16 *v_x0) +{ + v_x0[10] = vec_and(v_x0[0], (vec_bf16)vec_genbm(MASK_2)); + v_x0[15] = vec_and(v_x0[0], (vec_bf16)vec_genbm(MASK_3)); + vec_make_mult2(v_x0); + + v_x0[ 8] = VEC_SHIFT(v_x0[10], 2); + v_x0[ 9] = VEC_SHIFT(v_x0[10], 3); + v_x0[11] = VEC_SHIFT(v_x0[10], 1); + v_x0[12] = VEC_SHIFT(v_x0[15], 1); + v_x0[13] = VEC_SHIFT(v_x0[15], 2); + v_x0[14] = VEC_SHIFT(v_x0[15], 3); +} +#endif + +#endif diff --git a/kernel/power/sbgemv_n.c b/kernel/power/sbgemv_n.c new file mode 100644 index 0000000000..e6f7f587e6 --- /dev/null +++ b/kernel/power/sbgemv_n.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#ifndef SBGEMV_N_COMMON_C +#define SBGEMV_N_COMMON_C + +#if (defined(_ARCH_PWR10) && (defined(USE_BFGEMV_8_N_MMA) || (!defined(USE_BFGEMV_N_MMA) && defined(USE_BFGEMV_8_N_VSX)))) || (!defined(_ARCH_PWR10) && defined(USE_BFGEMV_8_N_VSX)) +#define USE_N_8 +#endif + +int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + IFLOAT *x_ptr, *ap[4]; + IFLOAT xbuffer[8] __attribute__((aligned(16))); + FLOAT *y_ptr, *ybuffer; + FLOAT buffer[NBMAX] __attribute__((aligned(16))); + + if ((m < 1) || (n < 1)) return 0; + + ybuffer = buffer; + y_ptr = y; + + BLASLONG lda4 = lda << 2; +#ifdef USE_N_8 + BLASLONG lda8 = lda << 3; +#endif + BLASLONG NB = NBMAX; + BLASLONG m2 = (m & (NBMAX - 1)); + + while (NB == NBMAX) { + m -= NB; + if (m < 0) { + if (m2 == 0) break; + NB = m2; + } + + if (inc_y != 1) { + copy_y_beta(NB, y_ptr, ybuffer, inc_y, beta); + } else { + ybuffer = y_ptr; + BF16GEMV_N_beta(NB, ybuffer, ybuffer, beta); + } + + x_ptr = x; + + ap[0] = a; + ap[1] = a + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_x == 1) { +#ifdef USE_N_8 + for (BLASLONG j = 0; j + 8 <= n; j += 8) { + BF16GEMV_N_8(NB, ap, x_ptr, ybuffer, lda4, alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + x_ptr += 8; + } + if (n & 4) { +#else + for (BLASLONG j = 0; j + 4 <= n; j += 4) { +#endif + BF16GEMV_N_4(NB, ap, x_ptr, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; +#ifndef USE_N_8 + ap[2] += lda4; + ap[3] += lda4; +#endif + x_ptr += 4; + } + if (n & 2) { + BF16GEMV_N_2(NB, ap, x_ptr, ybuffer, alpha); + ap[0] += (lda * 2); + x_ptr += 2; + } + if (n & 1) { + BF16GEMV_N_1(NB, ap, x_ptr, ybuffer, alpha); + } + } else { +#ifdef USE_N_8 + for (BLASLONG j = 0; j + 8 <= n; j += 8) { + copy_x(8, x_ptr, xbuffer, inc_x); + BF16GEMV_N_8(NB, ap, xbuffer, ybuffer, lda4, alpha); + ap[0] += lda8; + ap[1] += lda8; + ap[2] += lda8; + ap[3] += lda8; + x_ptr += 8 * inc_x; + } + if (n & 4) { +#else + for (BLASLONG j = 0; j + 4 <= n; j += 4) { +#endif + copy_x(4, x_ptr, xbuffer, inc_x); + BF16GEMV_N_4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; +#ifndef USE_N_8 + ap[2] += lda4; + ap[3] += lda4; +#endif + x_ptr += 4 * inc_x; + } + if (n & 2) { + copy_x(2, x_ptr, xbuffer, inc_x); + BF16GEMV_N_2(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += (lda * 2); + x_ptr += 2 * inc_x; + } + if (n & 1) { + copy_x(1, x_ptr, xbuffer, inc_x); + BF16GEMV_N_1(NB, ap, xbuffer, ybuffer, alpha); + } + } + + a += NB; + if (inc_y != 1) { + move_y(NB, ybuffer, y_ptr, inc_y); + } + y_ptr += (NB * inc_y); + } + + return 0; +} +#endif diff --git a/kernel/power/sbgemv_n_power10.c b/kernel/power/sbgemv_n_power10.c new file mode 100644 index 0000000000..b1dcb2fcc4 --- /dev/null +++ b/kernel/power/sbgemv_n_power10.c @@ -0,0 +1,474 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#ifndef SBGEMV_N_MMA_C +#define SBGEMV_N_MMA_C + +#define USE_BFGEMV_N_MMA + +#ifdef USE_BFGEMV_N_MMA +#include "sbgemv_common_power10.c" + +#ifndef BF16GEMV_N_X +#define BF16GEMV_N_X +#define BF16GEMV_N_8 BF16GEMV_N_MMA_8 +#define BF16GEMV_N_4 BF16GEMV_N_MMA_4 +#define BF16GEMV_N_2 BF16GEMV_N_MMA_2 +#define BF16GEMV_N_1 BF16GEMV_N_MMA_1 +#endif + +#define USE_BFGEMV_8_N_MMA + +static void BF16GEMV_N_MMA_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0; + __vector_quad temp[2*4]; + vec_f32 temp0[8*4]; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + +#ifdef USE_MERGE_MMA + vec_bf16 v_x0[4]; + v_x0[0] = vec_loadN(x_bf, 1); + vec_f32 vy0[2*4*2]; + + vec_make_mult1(v_x0, false); + + for (; i + 8 <= n8; i += 8) { + vec_load_mult184_mma(&temp[0], &va0[i + 0], &v_x0[ 0]); + vec_load_mult184_mma(&temp[2], &va0[i + 4], &v_x0[ 0]); + + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); + + vec_store8_pair(&v_y[(i * 2) + 0], vy0); + } + + if (n8 & 4) { + vec_load_mult184_mma(&temp[0], &va0[i + 0], &v_x0[ 0]); + + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + + i += 4; + } +#else + vec_bf16 v_x0[1]; + v_x0[0] = vec_loadN(x_bf, 1); + vec_f32 vy0[2*4]; + + for (; i + 4 <= n8; i += 4) { + vec_load_mult18_mma(&temp[0], &va0[i + 0], v_x0[ 0]); + + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + } +#endif + + for (; i < n8; i++) { + vec_load_mult12_mma(&temp[0], &va0[i], v_x0[ 0]); + + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + vec_loadN_mult12_mma(&temp[0], &va0[i], v_x0[ 0], n); + + n &= 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n); + } else if (n) { + vec_loadN_mult11_mma(&temp[0], &va0[i], v_x0[ 0], n); + + vy0[0] = 
vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} + +static void BF16GEMV_N_MMA_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1; + __vector_quad temp[2*4]; + vec_f32 temp0[8*4]; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + +#ifdef USE_MERGE_MMA + vec_bf16 v_x0[4]; + vec_f32 vy0[2*4*2]; + v_x0[0] = vec_loadN(x_bf, 2); + + vec_make_mult1(v_x0, false); + + for (; i + 8 <= n8; i += 8) { + vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); + + vec_store8_pair(&v_y[(i * 2) + 0], vy0); + } + + if (n8 & 4) { + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + + i += 4; + } +#else + vec_bf16 v_x0[1]; + vec_f32 vy0[2*4]; + v_x0[0] = vec_loadN(x_bf, 2); + + for (; i + 4 <= n8; i += 4) { + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); + + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + } +#endif + + for (; i < n8; i++) { + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); + + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + + n &= 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n); + } else if (n) { + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} + +static void BF16GEMV_N_MMA_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3; + __vector_quad temp[2*4]; + vec_f32 temp0[8*4]; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + vec_bf16 *va2 = (vec_bf16 *)a2; + vec_bf16 *va3 = (vec_bf16 *)a3; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + +#ifdef USE_MERGE_MMA + vec_bf16 v_x0[8]; + vec_f32 vy0[2*4*2]; + v_x0[0] = vec_loadN(x_bf, 4); + + vec_make_mult2(v_x0); + + for (; i + 8 <= n8; i += 8) { + vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult288b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); + + vec_store8_pair(&v_y[(i * 2) + 0], vy0); + } + + if (n8 & 4) { + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) 
+ 0], vy0); + + i += 4; + } +#else + vec_bf16 v_x0[5]; + vec_f32 vy0[2*4]; + v_x0[0] = vec_loadN(x_bf, 4); + + v_x0[ 4] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 1); + + for (; i + 4 <= n8; i += 4) { + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); + vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x0[ 4]); + + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + } +#endif + + for (; i < n8; i++) { + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); + vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4]); + + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); + + n &= 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n); + } else if (n) { + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); + + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} + +#ifdef USE_BFGEMV_8_N_MMA +static void BF16GEMV_N_MMA_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; + __vector_quad temp[2*4]; + vec_f32 temp0[8*4]; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4; + b1 = a1 + lda4; + b2 = a2 + lda4; + b3 = a3 + lda4; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + vec_bf16 *va2 = (vec_bf16 *)a2; + vec_bf16 *va3 = (vec_bf16 *)a3; + vec_bf16 *vb0 = (vec_bf16 *)b0; + vec_bf16 *vb1 = (vec_bf16 *)b1; + vec_bf16 *vb2 = (vec_bf16 *)b2; + vec_bf16 *vb3 = (vec_bf16 *)b3; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + +#ifdef USE_MERGE_MMA + vec_bf16 v_x0[16]; + vec_f32 vy0[2*4*2]; + v_x0[0] = (vec_bf16)vec_load_vec(x_bf); + + vec_make_mult4(v_x0); + + for (; i + 8 <= n8; i += 8) { + vec_load_mult288a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult288b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + vec_load_mult288b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], &v_x0[ 8]); + vec_load_mult288b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], &v_x0[12]); + + vec_load8_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce88_mma(&temp[0], temp0 + 0, v_alpha, vy0 + 0); + + vec_store8_pair(&v_y[(i * 2) + 0], vy0); + } + + if (n8 & 4) { + vec_load_mult284a_mma(&temp[0], &va0[i + 0], &va1[i + 0], &v_x0[ 0]); + vec_load_mult284b_mma(&temp[0], &va2[i + 0], &va3[i + 0], &v_x0[ 4]); + vec_load_mult284b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], &v_x0[ 8]); + vec_load_mult284b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], &v_x0[12]); + + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce84_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + + i += 4; + } +#else + vec_bf16 v_x0[13]; + vec_f32 vy0[2*4]; + v_x0[0] = (vec_bf16)vec_load_vec(x_bf); + + v_x0[ 4] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 1); + v_x0[ 8] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 2); + v_x0[12] = (vec_bf16)vec_splat((vec_f32)v_x0[0], 3); + + for 
(; i + 4 <= n8; i += 4) { + vec_load_mult28a_mma(&temp[0], &va0[i + 0], &va1[i + 0], v_x0[ 0]); + vec_load_mult28b_mma(&temp[0], &va2[i + 0], &va3[i + 0], v_x0[ 4]); + vec_load_mult28b_mma(&temp[0], &vb0[i + 0], &vb1[i + 0], v_x0[ 8]); + vec_load_mult28b_mma(&temp[0], &vb2[i + 0], &vb3[i + 0], v_x0[12]); + + vec_load4_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce8_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store4_pair(&v_y[(i * 2) + 0], vy0); + } +#endif + + for (; i < n8; i++) { + vec_load_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0]); + vec_load_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4]); + vec_load_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8]); + vec_load_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12]); + + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + vec_loadN_mult22a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vec_loadN_mult22b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); + vec_loadN_mult22b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8], n); + vec_loadN_mult22b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12], n); + + n &= 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n); + + vec_reduce2_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n); + } else if (n) { + vec_loadN_mult11a_mma(&temp[0], &va0[i], &va1[i], v_x0[ 0], n); + vec_loadN_mult11b_mma(&temp[0], &va2[i], &va3[i], v_x0[ 4], n); + vec_loadN_mult11b_mma(&temp[0], &vb0[i], &vb1[i], v_x0[ 8], n); + vec_loadN_mult11b_mma(&temp[0], &vb2[i], &vb3[i], v_x0[12], n); + + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vec_reduce1_mma(&temp[0], temp0, v_alpha, vy0); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} +#endif + +#include "sbgemv_n.c" +#else +#include "sbgemv_n_vsx.c" +#endif +#endif + diff --git a/kernel/power/sbgemv_n_vsx.c b/kernel/power/sbgemv_n_vsx.c new file mode 100644 index 0000000000..390a87359d --- /dev/null +++ b/kernel/power/sbgemv_n_vsx.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#ifndef SBGEMV_N_VSX_C +#define SBGEMV_N_VSX_C + +#include "sbgemv_common.c" + +#ifndef BF16GEMV_N_X +#define BF16GEMV_N_X +#define BF16GEMV_N_8 BF16GEMV_N_VSX_8 +#define BF16GEMV_N_4 BF16GEMV_N_VSX_4 +#define BF16GEMV_N_2 BF16GEMV_N_VSX_2 +#define BF16GEMV_N_1 BF16GEMV_N_VSX_1 +#endif + +#define USE_BFGEMV_8_N_VSX + +static void BF16GEMV_N_VSX_1(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_f32 x_0 = vec_loadNHi(x_bf, 1, zero); + x_0 *= v_alpha; + + vec_f32 v_x0 = vec_splat(x_0, 0); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 vy0[2]; + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult2(v_x0, &va0[i], zero, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + + vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + } else if (n) { + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} + +static void BF16GEMV_N_VSX_2(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_f32 x_0 = vec_loadNHi(x_bf, 2, zero); + x_0 *= v_alpha; + + vec_f32 v_x0 = vec_splat(x_0, 0); + vec_f32 v_x1 = vec_splat(x_0, 1); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 vy0[2]; + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult2(v_x0, &va0[i], zero, vy0); + vec_load_mult2(v_x1, &va1[i], zero, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + + vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); + vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + } else if (n) { + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero); + vy0[0] += vec_loadNHi_mult(&va1[i], v_x1, n, zero); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} + +static void BF16GEMV_N_VSX_4(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 
*)a1; + vec_bf16 *va2 = (vec_bf16 *)a2; + vec_bf16 *va3 = (vec_bf16 *)a3; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_f32 x_0 = vec_loadNHi(x_bf, 4, zero); + x_0 *= v_alpha; + + vec_f32 v_x0 = vec_splat(x_0, 0); + vec_f32 v_x1 = vec_splat(x_0, 1); + vec_f32 v_x2 = vec_splat(x_0, 2); + vec_f32 v_x3 = vec_splat(x_0, 3); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 vy0[2]; + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult2(v_x0, &va0[i], zero, vy0); + vec_load_mult2(v_x1, &va1[i], zero, vy0); + vec_load_mult2(v_x2, &va2[i], zero, vy0); + vec_load_mult2(v_x3, &va3[i], zero, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + + vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); + vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); + vec_loadN_mult2(v_x2, &va2[i], n, zero, vy0); + vec_loadN_mult2(v_x3, &va3[i], n, zero, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + } else if (n) { + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero); + vy0[0] += vec_loadNHi_mult(&va1[i], v_x1, n, zero); + vy0[0] += vec_loadNHi_mult(&va2[i], v_x2, n, zero); + vy0[0] += vec_loadNHi_mult(&va3[i], v_x3, n, zero); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} + +#ifdef USE_BFGEMV_8_N_VSX +static void BF16GEMV_N_VSX_8(BLASLONG n, IFLOAT **ap, IFLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3, *b0, *b1, *b2, *b3; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 v_alpha = { alpha, alpha, alpha, alpha }; + + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + b0 = a0 + lda4; + b1 = a1 + lda4; + b2 = a2 + lda4; + b3 = a3 + lda4; + + vec_bf16 *va0 = (vec_bf16 *)a0; + vec_bf16 *va1 = (vec_bf16 *)a1; + vec_bf16 *va2 = (vec_bf16 *)a2; + vec_bf16 *va3 = (vec_bf16 *)a3; + vec_bf16 *vb0 = (vec_bf16 *)b0; + vec_bf16 *vb1 = (vec_bf16 *)b1; + vec_bf16 *vb2 = (vec_bf16 *)b2; + vec_bf16 *vb3 = (vec_bf16 *)b3; + + vec_bf16 *x_bf = (vec_bf16 *)(xo); + vec_bf16 x_in = (vec_bf16)vec_load_vec(x_bf); + vec_f32 x_0 = BF16_HI(x_in, zero); + vec_f32 x_1 = BF16_LO(x_in, zero); + x_0 *= v_alpha; + x_1 *= v_alpha; + + vec_f32 v_x0 = vec_splat(x_0, 0); + vec_f32 v_x1 = vec_splat(x_0, 1); + vec_f32 v_x2 = vec_splat(x_0, 2); + vec_f32 v_x3 = vec_splat(x_0, 3); + vec_f32 v_x4 = vec_splat(x_1, 0); + vec_f32 v_x5 = vec_splat(x_1, 1); + vec_f32 v_x6 = vec_splat(x_1, 2); + vec_f32 v_x7 = vec_splat(x_1, 3); + + vec_f32 *v_y = (vec_f32 *)y; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + vec_f32 vy0[2]; + + for (; i < n8; i++) { + vec_load_pair(vy0, &v_y[(i * 2) + 0]); + + vec_load_mult2(v_x0, &va0[i], zero, vy0); + vec_load_mult2(v_x1, &va1[i], zero, vy0); + vec_load_mult2(v_x2, &va2[i], zero, vy0); + vec_load_mult2(v_x3, &va3[i], zero, vy0); + vec_load_mult2(v_x4, &vb0[i], zero, vy0); + vec_load_mult2(v_x5, &vb1[i], zero, vy0); + vec_load_mult2(v_x6, &vb2[i], zero, vy0); + vec_load_mult2(v_x7, &vb3[i], zero, vy0); + + vec_store_pair(&v_y[(i * 2) + 0], vy0); + } + + n &= 7; + if (n > 4) { + BLASLONG n3 = n & 3; + vec_loadN2_f32(vy0, &v_y[(i * 2) + 0], n3); + + vec_loadN_mult2(v_x0, &va0[i], n, zero, vy0); + vec_loadN_mult2(v_x1, &va1[i], n, zero, vy0); + vec_loadN_mult2(v_x2, &va2[i], n, zero, vy0); + vec_loadN_mult2(v_x3, &va3[i], n, zero, vy0); + vec_loadN_mult2(v_x4, &vb0[i], n, zero, vy0); + vec_loadN_mult2(v_x5, &vb1[i], n, zero, vy0); + vec_loadN_mult2(v_x6, &vb2[i], n, zero, 
vy0); + vec_loadN_mult2(v_x7, &vb3[i], n, zero, vy0); + + vec_storeN2_f32(vy0, &v_y[(i * 2) + 0], n3); + } else if (n) { + vy0[0] = vec_loadN_f32(&v_y[(i * 2) + 0], n); + + vy0[0] += vec_loadNHi_mult(&va0[i], v_x0, n, zero); + vy0[0] += vec_loadNHi_mult(&va1[i], v_x1, n, zero); + vy0[0] += vec_loadNHi_mult(&va2[i], v_x2, n, zero); + vy0[0] += vec_loadNHi_mult(&va3[i], v_x3, n, zero); + vy0[0] += vec_loadNHi_mult(&vb0[i], v_x4, n, zero); + vy0[0] += vec_loadNHi_mult(&vb1[i], v_x5, n, zero); + vy0[0] += vec_loadNHi_mult(&vb2[i], v_x6, n, zero); + vy0[0] += vec_loadNHi_mult(&vb3[i], v_x7, n, zero); + + vec_storeN_f32(vy0[0], &v_y[(i * 2) + 0], n); + } +} +#endif + +#include "sbgemv_n.c" +#endif diff --git a/kernel/power/sbgemv_t.c b/kernel/power/sbgemv_t.c new file mode 100644 index 0000000000..594b1fc57b --- /dev/null +++ b/kernel/power/sbgemv_t.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#ifndef SBGEMV_T_COMMON_C
+#define SBGEMV_T_COMMON_C
+
+#if (defined(_ARCH_PWR10) && (defined(USE_BFGEMV_8_T_MMA) || (!defined(USE_BFGEMV_T_MMA) && defined(USE_BFGEMV_8_T_VSX)))) || (!defined(_ARCH_PWR10) && defined(USE_BFGEMV_8_T_VSX))
+#define USE_T_8
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT alpha, IFLOAT *a, BLASLONG lda, IFLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
+{
+  IFLOAT *xbuffer, *a_ptr;
+  IFLOAT buffer[NBMAX] __attribute__((aligned(16)));
+  FLOAT ybuffer[8] __attribute__((aligned(16)));
+  FLOAT *y_ptr;
+
+  if ((m < 1) || (n < 1)) return 0;
+
+  if (inc_y == 1) {
+    BF16GEMV_N_beta(n, y, y, beta);
+  }
+
+  xbuffer = buffer;
+
+  BLASLONG lda4 = lda << 2;
+#ifdef USE_T_8
+  BLASLONG lda8 = lda << 3;
+#endif
+  BLASLONG NB = NBMAX;
+  BLASLONG m2 = (m & (NBMAX - 1));
+
+  while (NB == NBMAX) {
+    m -= NB;
+    if (m < 0) {
+      if (m2 == 0) break;
+      NB = m2;
+    }
+
+    a_ptr = a;
+    a += NB;
+    y_ptr = y;
+
+    if (inc_x != 1) {
+      copy_x(NB, x, xbuffer, inc_x);
+      x += NB * inc_x;
+    } else {
+      xbuffer = x;
+      x += NB;
+    }
+
+    if (inc_y == 1) {
+#ifdef USE_T_8
+      for (BLASLONG j = 0; j + 8 <= n; j += 8) {
+        BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
+        y_ptr += 8;
+        a_ptr += lda8;
+      }
+      if (n & 4) {
+#else
+      for (BLASLONG j = 0; j + 4 <= n; j += 4) {
+#endif
+        BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
+        y_ptr += 4;
+        a_ptr += lda4;
+      }
+      if (n & 2) {
+        BF16GEMV_T_2(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
+        y_ptr += 2;
+        a_ptr += (lda * 2);
+      }
+      if (n & 1) {
+        BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
+      }
+    } else {
+#ifdef USE_T_8
+      for (BLASLONG j = 0; j + 8 <= n; j += 8) {
+        memset(ybuffer, 0, sizeof(FLOAT) * 8);
+        BF16GEMV_T_8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
+        copy_y(8, ybuffer, y_ptr, inc_y, beta);
+        y_ptr += 8 * inc_y;
+        a_ptr += lda8;
+      }
+      if (n & 4) {
+#else
+      for (BLASLONG j = 0; j + 4 <= n; j += 4) {
+#endif
+        memset(ybuffer, 0, sizeof(FLOAT) * 4);
+        BF16GEMV_T_4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
+        copy_y(4, ybuffer, y_ptr, inc_y, beta);
+        y_ptr += 4 * inc_y;
+        a_ptr += lda4;
+      }
+      if (n & 2) {
+        memset(ybuffer, 0, sizeof(FLOAT) * 4);
+        BF16GEMV_T_2(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
+        copy_y(2, ybuffer, y_ptr, inc_y, beta);
+        y_ptr += 2 * inc_y;
+        a_ptr += (lda * 2);
+      }
+      if (n & 1) {
+        memset(ybuffer, 0, sizeof(FLOAT) * 4);
+        BF16GEMV_T_1(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
+        copy_y(1, ybuffer, y_ptr, inc_y, beta);
+      }
+      beta = (FLOAT)1;
+    }
+  }
+
+  return 0;
+}
+#endif
+
diff --git a/kernel/power/sbgemv_t_power10.c b/kernel/power/sbgemv_t_power10.c
new file mode 100644
index 0000000000..40c166354b
--- /dev/null
+++ b/kernel/power/sbgemv_t_power10.c
@@ -0,0 +1,338 @@
+/***************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#ifndef SBGEMV_T_MMA_C +#define SBGEMV_T_MMA_C + +#define USE_BFGEMV_T_MMA + +#ifdef USE_BFGEMV_T_MMA +#include "sbgemv_common_power10.c" + +#ifndef BF16GEMV_T_X +#define BF16GEMV_T_X +#define BF16GEMV_T_8 BF16GEMV_T_MMA_8 +#define BF16GEMV_T_4 BF16GEMV_T_MMA_4 +#define BF16GEMV_T_2 BF16GEMV_T_MMA_2 +#define BF16GEMV_T_1 BF16GEMV_T_MMA_1 +#endif + +#define USE_BFGEMV_8_T_MMA + +static void BF16GEMV_T_MMA_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0; + vec_bf16 *va0, *v_x; + __vector_quad temp0; + vec_f32 temp00[4]; + vec_bf16 inp[4]; + + __builtin_mma_xxsetaccz(&temp0); + + a0 = ap; + va0 = (vec_bf16 *)a0; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 4 <= n8; i += 4) { + vec_load_pair2(inp, &v_x[i]); + + vec_load_mult4_mma(&temp0, &va0[i + 0], inp); + } + + if (n8 & 2) { + vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); + + vec_load_mult2_mma(&temp0, &va0[i + 0], inp); + + i += 2; + } + + if (n8 & 1) { + inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); + + vec_load_mult_mma(&temp0, &va0[i], inp[0]); + + i++; + } + + n &= 7; + if (n) { + inp[0] = vec_loadN(&v_x[i], n); + + vec_loadN_mult_mma(&temp0, &va0[i], inp[0], n); + } + + __builtin_mma_disassemble_acc((void*)temp00, &temp0); + + y[0] += (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])); +} + +static void BF16GEMV_T_MMA_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1; + vec_bf16 *va0, *va1, *v_x; + __vector_quad temp0[2]; + vec_f32 temp00[4*2]; + vec_bf16 inp[4]; + + vec_setzero_2(&temp0[0]); + + a0 = ap; + a1 = ap + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 4 <= n8; i += 4) { + vec_load_pair2(inp, &v_x[i]); + + vec_load_mult42_mma(&temp0[0], &va0[i + 0], &va1[i + 0], inp); + } + + if (n8 & 2) { + vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); + + vec_load_mult22_mma(&temp0[0], &va0[i + 0], &va1[i + 0], inp); + + i += 2; + } + + if (n8 & 1) { + inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); + + vec_load_mult12a_mma(&temp0[0], &va0[i], &va1[i], inp[0]); + + i++; + } + + n &= 7; + if (n) { + inp[0] = vec_loadN(&v_x[i], n); + + vec_loadN_mult12a_mma(&temp0[0], &va0[i], &va1[i], inp[0], n); + } + + vec_reduce_2(temp00, &temp0[0]); + + y[0] += (alpha * (temp00[0][0] + temp00[1][1] + temp00[2][2] + temp00[3][3])); + y[1] += (alpha * (temp00[4][0] + temp00[5][1] + temp00[6][2] + temp00[7][3])); +} 
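+
+/* Reduction sketch: with a full vector of bf16 pairs on each side, every
+   __builtin_mma_xvbf16ger2pp accumulator holds a 4x4 f32 tile, and the dot
+   product for one output row is the sum along that tile's diagonal -- the
+   scalar reduction used above for rows c = 0, 1:
+
+     y[c] += alpha * (temp00[4 * c + 0][0] + temp00[4 * c + 1][1] +
+                      temp00[4 * c + 2][2] + temp00[4 * c + 3][3]);
+
+   The 4- and 8-row kernels below avoid these scalar adds: they gather the
+   diagonals of four accumulators into the lanes of one vec_f32 using
+   vec_mergeh/vec_mergeo/vec_mergel plus vec_xxpermdi, so each group of four
+   rows is finished with vector adds and a single vector multiply by alpha. */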
+ +static void BF16GEMV_T_MMA_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3; + vec_bf16 *va0, *va1, *va2, *va3, *v_x; + __vector_quad temp0[4]; + vec_f32 temp00[4*4]; + vec_bf16 inp[4]; + + vec_setzero_4(&temp0[0]); + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + va2 = (vec_bf16 *)a2; + va3 = (vec_bf16 *)a3; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 4 <= n8; i += 4) { + vec_load_pair2(inp, &v_x[i]); + + vec_load_mult44_mma(&temp0[0], &va0[i + 0], &va1[i + 0], &va2[i + 0], &va3[i + 0], inp); + } + + if (n8 & 2) { + vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); + + vec_load_mult24_mma(&temp0[0], &va0[i + 0], &va1[i + 0], &va2[i + 0], &va3[i + 0], inp); + + i += 2; + } + + if (n8 & 1) { + inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); + + vec_load_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0]); + + i++; + } + + n &= 7; + if (n) { + inp[0] = vec_loadN(&v_x[i], n); + + vec_loadN_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0], n); + } + + vec_reduce_4(temp00, &temp0[0]); + + vec_f32 t0, t1, t2, t3, t4, t5, t6, t7; + vec_f32 a = { alpha, alpha, alpha, alpha }; + vec_f32 *v_y = (vec_f32 *) y; + + t0 = vec_mergeh(temp00[ 0], temp00[ 4]); + t1 = vec_mergeh(temp00[ 8], temp00[12]); + t2 = vec_mergeo(temp00[ 1], temp00[ 5]); + t3 = vec_mergeo(temp00[ 9], temp00[13]); + t4 = vec_mergel(temp00[ 2], temp00[ 6]); + t5 = vec_mergel(temp00[10], temp00[14]); + t6 = vec_mergeo(temp00[ 3], temp00[ 7]); + t7 = vec_mergeo(temp00[11], temp00[15]); + t0 = vec_xxpermdi(t0, t1, 0); + t2 = vec_xxpermdi(t2, t3, 0); + t4 = vec_xxpermdi(t4, t5, 0); + t6 = vec_xxpermdi(t6, t7, 3); + + t0 += t2 + t4 + t6; + + v_y[0] += (a * t0); +} + +#ifdef USE_BFGEMV_8_T_MMA +static void BF16GEMV_T_MMA_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + vec_bf16 *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + __vector_quad temp0[8]; + vec_f32 temp00[4*8]; + vec_bf16 inp[4]; + + vec_setzero_8(&temp0[0]); + + BLASLONG lda4 = lda << 2; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a0 + lda4; + a5 = a1 + lda4; + a6 = a2 + lda4; + a7 = a3 + lda4; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + va2 = (vec_bf16 *)a2; + va3 = (vec_bf16 *)a3; + va4 = (vec_bf16 *)a4; + va5 = (vec_bf16 *)a5; + va6 = (vec_bf16 *)a6; + va7 = (vec_bf16 *)a7; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i + 4 <= n8; i += 4) { + vec_load_pair2(inp, &v_x[i]); + + vec_load_mult44_mma(&temp0[0], &va0[i + 0], &va1[i + 0], &va2[i + 0], &va3[i + 0], inp); + vec_load_mult44_mma(&temp0[4], &va4[i + 0], &va5[i + 0], &va6[i + 0], &va7[i + 0], inp); + } + + if (n8 & 2) { + vec_load_pair((vec_f32 *)inp, (vec_f32 *)&v_x[i]); + + vec_load_mult24_mma(&temp0[0], &va0[i + 0], &va1[i + 0], &va2[i + 0], &va3[i + 0], inp); + vec_load_mult24_mma(&temp0[4], &va4[i + 0], &va5[i + 0], &va6[i + 0], &va7[i + 0], inp); + + i += 2; + } + + if (n8 & 1) { + inp[0] = (vec_bf16)vec_load_vec(&v_x[i]); + + vec_load_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0]); + vec_load_mult14_mma(&temp0[4], &va4[i], &va5[i], &va6[i], &va7[i], inp[0]); + + i++; + } + + n &= 7; + if (n) { + inp[0] = vec_loadN(&v_x[i], n); + + vec_loadN_mult14_mma(&temp0[0], &va0[i], &va1[i], &va2[i], &va3[i], inp[0], n); + vec_loadN_mult14_mma(&temp0[4], &va4[i], &va5[i], &va6[i], 
&va7[i], inp[0], n); + } + + vec_reduce_8(temp00, &temp0[0]); + + vec_f32 t0, t1, t2, t3, t4, t5, t6, t7, t10, t11, t12, t13, t14, t15, t16, t17; + vec_f32 a = { alpha, alpha, alpha, alpha }; + vec_f32 *v_y = (vec_f32 *) y; + + t0 = vec_mergeh(temp00[ 0], temp00[ 4]); + t1 = vec_mergeh(temp00[ 8], temp00[12]); + t2 = vec_mergeo(temp00[ 1], temp00[ 5]); + t3 = vec_mergeo(temp00[ 9], temp00[13]); + t4 = vec_mergel(temp00[ 2], temp00[ 6]); + t5 = vec_mergel(temp00[10], temp00[14]); + t6 = vec_mergeo(temp00[ 3], temp00[ 7]); + t7 = vec_mergeo(temp00[11], temp00[15]); + t0 = vec_xxpermdi(t0, t1, 0); + t2 = vec_xxpermdi(t2, t3, 0); + t4 = vec_xxpermdi(t4, t5, 0); + t6 = vec_xxpermdi(t6, t7, 3); + + t0 += t2 + t4 + t6; + + t10 = vec_mergeh(temp00[16], temp00[20]); + t11 = vec_mergeh(temp00[24], temp00[28]); + t12 = vec_mergeo(temp00[17], temp00[21]); + t13 = vec_mergeo(temp00[25], temp00[29]); + t14 = vec_mergel(temp00[18], temp00[22]); + t15 = vec_mergel(temp00[26], temp00[30]); + t16 = vec_mergeo(temp00[19], temp00[23]); + t17 = vec_mergeo(temp00[27], temp00[31]); + t10 = vec_xxpermdi(t10, t11, 0); + t12 = vec_xxpermdi(t12, t13, 0); + t14 = vec_xxpermdi(t14, t15, 0); + t16 = vec_xxpermdi(t16, t17, 3); + + t10 += t12 + t14 + t16; + + vec_f32 inp2[2]; + vec_load_pair(inp2, v_y); + inp2[0] += (a * t0); + inp2[1] += (a * t10); + vec_store_pair(v_y, inp2); +} +#endif + +#include "sbgemv_t.c" +#else +#include "sbgemv_t_vsx.c" +#endif +#endif + diff --git a/kernel/power/sbgemv_t_vsx.c b/kernel/power/sbgemv_t_vsx.c new file mode 100644 index 0000000000..e72d2f31e0 --- /dev/null +++ b/kernel/power/sbgemv_t_vsx.c @@ -0,0 +1,292 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#ifndef SBGEMV_T_VSX_C +#define SBGEMV_T_VSX_C + +#include "sbgemv_common.c" + +#ifndef BF16GEMV_T_X +#define BF16GEMV_T_X +#define BF16GEMV_T_8 BF16GEMV_T_VSX_8 +#define BF16GEMV_T_4 BF16GEMV_T_VSX_4 +#define BF16GEMV_T_2 BF16GEMV_T_VSX_2 +#define BF16GEMV_T_1 BF16GEMV_T_VSX_1 +#endif + +#define USE_BFGEMV_8_T_VSX + +static void BF16GEMV_T_VSX_1(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0; + vec_bf16 *va0, *v_x; + vec_f32 temp0 = { 0, 0, 0, 0 }; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 inp[2]; + + a0 = ap; + va0 = (vec_bf16 *)a0; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i < n8; i++) { + vec_load_vec2(&v_x[i], inp, zero); + + temp0 += vec_load_mult(&va0[i], inp, zero); + } + + n &= 7; + if (n > 4) { + vec_loadN_vec2(&v_x[i], inp, n, zero); + + temp0 += vec_loadN_mult(&va0[i], inp, n, zero); + } else if (n) { + inp[0] = vec_loadNHi(&v_x[i], n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); + } + + y[0] += (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])); +} + +static void BF16GEMV_T_VSX_2(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1; + vec_bf16 *va0, *va1, *v_x; + vec_f32 temp0 = { 0, 0, 0, 0 }; + vec_f32 temp1 = { 0, 0, 0, 0 }; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 inp[2]; + + a0 = ap; + a1 = ap + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i < n8; i++) { + vec_load_vec2(&v_x[i], inp, zero); + + temp0 += vec_load_mult(&va0[i], inp, zero); + temp1 += vec_load_mult(&va1[i], inp, zero); + } + + n &= 7; + if (n > 4) { + vec_loadN_vec2(&v_x[i], inp, n, zero); + + temp0 += vec_loadN_mult(&va0[i], inp, n, zero); + temp1 += vec_loadN_mult(&va1[i], inp, n, zero); + } else if (n) { + inp[0] = vec_loadNHi(&v_x[i], n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); + temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); + } + + y[0] += (alpha * (temp0[0] + temp0[1] + temp0[2] + temp0[3])); + y[1] += (alpha * (temp1[0] + temp1[1] + temp1[2] + temp1[3])); +} + +static void BF16GEMV_T_VSX_4(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3; + vec_bf16 *va0, *va1, *va2, *va3, *v_x; + vec_f32 temp0 = { 0, 0, 0, 0 }; + vec_f32 temp1 = { 0, 0, 0, 0 }; + vec_f32 temp2 = { 0, 0, 0, 0 }; + vec_f32 temp3 = { 0, 0, 0, 0 }; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 inp[2]; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + va2 = (vec_bf16 *)a2; + va3 = (vec_bf16 *)a3; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i < n8; i++) { + vec_load_vec2(&v_x[i], inp, zero); + + temp0 += vec_load_mult(&va0[i], inp, zero); + temp1 += vec_load_mult(&va1[i], inp, zero); + temp2 += vec_load_mult(&va2[i], inp, zero); + temp3 += vec_load_mult(&va3[i], inp, zero); + } + + n &= 7; + if (n > 4) { + vec_loadN_vec2(&v_x[i], inp, n, zero); + + temp0 += vec_loadN_mult(&va0[i], inp, n, zero); + temp1 += vec_loadN_mult(&va1[i], inp, n, zero); + temp2 += vec_loadN_mult(&va2[i], inp, n, zero); + temp3 += vec_loadN_mult(&va3[i], inp, n, zero); + } else if (n) { + inp[0] = vec_loadNHi(&v_x[i], n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); + temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); + temp2 += 
vec_loadNHi_mult(&va2[i], inp[0], n, zero); + temp3 += vec_loadNHi_mult(&va3[i], inp[0], n, zero); + } + + vec_f32 t0, t1, t2, t3; + vec_f32 a = { alpha, alpha, alpha, alpha }; + vec_f32 *v_y = (vec_f32 *) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + v_y[0] += (a * temp0); +} + +#ifdef USE_BFGEMV_8_T_VSX +static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FLOAT *y, FLOAT alpha) +{ + IFLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + vec_bf16 *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + vec_f32 temp0 = { 0, 0, 0, 0 }; + vec_f32 temp1 = { 0, 0, 0, 0 }; + vec_f32 temp2 = { 0, 0, 0, 0 }; + vec_f32 temp3 = { 0, 0, 0, 0 }; + vec_f32 temp4 = { 0, 0, 0, 0 }; + vec_f32 temp5 = { 0, 0, 0, 0 }; + vec_f32 temp6 = { 0, 0, 0, 0 }; + vec_f32 temp7 = { 0, 0, 0, 0 }; + vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + vec_f32 inp[2]; + + BLASLONG lda4 = lda << 2; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a0 + lda4; + a5 = a1 + lda4; + a6 = a2 + lda4; + a7 = a3 + lda4; + va0 = (vec_bf16 *)a0; + va1 = (vec_bf16 *)a1; + va2 = (vec_bf16 *)a2; + va3 = (vec_bf16 *)a3; + va4 = (vec_bf16 *)a4; + va5 = (vec_bf16 *)a5; + va6 = (vec_bf16 *)a6; + va7 = (vec_bf16 *)a7; + v_x = (vec_bf16 *)x; + BLASLONG n8 = n / 8; + BLASLONG i = 0; + + for (; i < n8; i++) { + vec_load_vec2(&v_x[i], inp, zero); + + temp0 += vec_load_mult(&va0[i], inp, zero); + temp1 += vec_load_mult(&va1[i], inp, zero); + temp2 += vec_load_mult(&va2[i], inp, zero); + temp3 += vec_load_mult(&va3[i], inp, zero); + temp4 += vec_load_mult(&va4[i], inp, zero); + temp5 += vec_load_mult(&va5[i], inp, zero); + temp6 += vec_load_mult(&va6[i], inp, zero); + temp7 += vec_load_mult(&va7[i], inp, zero); + } + + n &= 7; + if (n > 4) { + vec_loadN_vec2(&v_x[i], inp, n, zero); + + temp0 += vec_loadN_mult(&va0[i], inp, n, zero); + temp1 += vec_loadN_mult(&va1[i], inp, n, zero); + temp2 += vec_loadN_mult(&va2[i], inp, n, zero); + temp3 += vec_loadN_mult(&va3[i], inp, n, zero); + temp4 += vec_loadN_mult(&va4[i], inp, n, zero); + temp5 += vec_loadN_mult(&va5[i], inp, n, zero); + temp6 += vec_loadN_mult(&va6[i], inp, n, zero); + temp7 += vec_loadN_mult(&va7[i], inp, n, zero); + } else if (n) { + inp[0] = vec_loadNHi(&v_x[i], n, zero); + + temp0 += vec_loadNHi_mult(&va0[i], inp[0], n, zero); + temp1 += vec_loadNHi_mult(&va1[i], inp[0], n, zero); + temp2 += vec_loadNHi_mult(&va2[i], inp[0], n, zero); + temp3 += vec_loadNHi_mult(&va3[i], inp[0], n, zero); + temp4 += vec_loadNHi_mult(&va4[i], inp[0], n, zero); + temp5 += vec_loadNHi_mult(&va5[i], inp[0], n, zero); + temp6 += vec_loadNHi_mult(&va6[i], inp[0], n, zero); + temp7 += vec_loadNHi_mult(&va7[i], inp[0], n, zero); + } + + vec_f32 t0, t1, t2, t3, t10, t11, t12, t13; + vec_f32 a = { alpha, alpha, alpha, alpha }; + vec_f32 *v_y = (vec_f32 *) y; + + t0 = vec_mergeh(temp0, temp2); + t1 = vec_mergel(temp0, temp2); + t2 = vec_mergeh(temp1, temp3); + t3 = vec_mergel(temp1, temp3); + temp0 = vec_mergeh(t0, t2); + temp1 = vec_mergel(t0, t2); + temp2 = vec_mergeh(t1, t3); + temp3 = vec_mergel(t1, t3); + temp0 += temp1 + temp2 + temp3; + + t10 = vec_mergeh(temp4, temp6); + t11 = vec_mergel(temp4, temp6); + t12 = vec_mergeh(temp5, temp7); + t13 = vec_mergel(temp5, temp7); + temp4 = vec_mergeh(t10, t12); + temp5 = vec_mergel(t10, 
t12); + temp6 = vec_mergeh(t11, t13); + temp7 = vec_mergel(t11, t13); + temp4 += temp5 + temp6 + temp7; + + vec_load_pair(inp, v_y); + inp[0] += (a * temp0); + inp[1] += (a * temp4); + vec_store_pair(v_y, inp); +} +#endif + +#include "sbgemv_t.c" +#endif + diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index b8aaee8be3..05d9b33aba 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -202,16 +202,17 @@ main (int argc, char *argv[]) return ret; } + for (l = 0; l < 2; l++) { // l = 1 to test inc_x & inc_y not equal to one. for (x = 1; x <= loop; x++) { - k = (x == 0) ? 0 : 1; + k = (x == 0) ? 0 : l + 1; float *A = (float *)malloc_safe(x * x * sizeof(FLOAT)); - float *B = (float *)malloc_safe(x * sizeof(FLOAT)); - float *C = (float *)malloc_safe(x * sizeof(FLOAT)); + float *B = (float *)malloc_safe(x * sizeof(FLOAT) << l); + float *C = (float *)malloc_safe(x * sizeof(FLOAT) << l); bfloat16_bits *AA = (bfloat16_bits *)malloc_safe(x * x * sizeof(bfloat16_bits)); - bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(x * sizeof(bfloat16_bits)); + bfloat16_bits *BB = (bfloat16_bits *)malloc_safe(x * sizeof(bfloat16_bits) << l); float *DD = (float *)malloc_safe(x * sizeof(FLOAT)); - float *CC = (float *)malloc_safe(x * sizeof(FLOAT)); + float *CC = (float *)malloc_safe(x * sizeof(FLOAT) << l); if ((A == NULL) || (B == NULL) || (C == NULL) || (AA == NULL) || (BB == NULL) || (DD == NULL) || (CC == NULL)) return 1; @@ -226,9 +227,9 @@ main (int argc, char *argv[]) sbstobf16_(&one, &A[j*x+i], &one, &atmp, &one); AA[j * x + i].v = atmp; } - B[j] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; - sbstobf16_(&one, &B[j], &one, &btmp, &one); - BB[j].v = btmp; + B[j << l] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5; + sbstobf16_(&one, &B[j << l], &one, &btmp, &one); + BB[j << l].v = btmp; } for (y = 0; y < 2; y++) { @@ -238,9 +239,9 @@ main (int argc, char *argv[]) transA = 'T'; } - memset(CC, 0, x * sizeof(FLOAT)); + memset(CC, 0, x * sizeof(FLOAT) << l); memset(DD, 0, x * sizeof(FLOAT)); - memset(C, 0, x * sizeof(FLOAT)); + memset(C, 0, x * sizeof(FLOAT) << l); SGEMV (&transA, &x, &x, &alpha, A, &x, B, &k, &beta, C, &k); SBGEMV (&transA, &x, &x, &alpha, (bfloat16*) AA, &x, (bfloat16*) BB, &k, &beta, CC, &k); @@ -248,15 +249,15 @@ main (int argc, char *argv[]) for (j = 0; j < x; j++) for (i = 0; i < x; i++) if (transA == 'N') { - DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j]); + DD[i] += float16to32 (AA[j * x + i]) * float16to32 (BB[j << l]); } else if (transA == 'T') { - DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i]); + DD[j] += float16to32 (AA[j * x + i]) * float16to32 (BB[i << l]); } for (j = 0; j < x; j++) { - if (fabs (CC[j] - C[j]) > 1.0) + if (fabs (CC[j << l] - C[j << l]) > 1.0) ret++; - if (fabs (CC[j] - DD[j]) > 1.0) + if (fabs (CC[j << l] - DD[j]) > 1.0) ret++; } } @@ -268,6 +269,7 @@ main (int argc, char *argv[]) free(DD); free(CC); } + } if (ret != 0) fprintf (stderr, "FATAL ERROR SBGEMV - Return code: %d\n", ret); From a47b3c886737a782fa6f2e89df0d712200801de9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Oct 2024 22:54:34 +0200 Subject: [PATCH 094/244] Fix unroll parameter selection for MIPS64_GENERIC --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 259592cdfe..fee9195d02 100644 --- a/param.h +++ b/param.h @@ -2969,7 +2969,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL -#if defined(NO_MSA) +#if defined(NO_MSA) || defined(MIPS64_GENERIC) #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 From a659f40fe116f237f624dd309ea1cfaac3015227 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Oct 2024 18:53:30 +0200 Subject: [PATCH 095/244] Fix leading dimension for B (Reference-LAPACK PR 1064) --- lapack-netlib/TESTING/EIG/cget52.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/cget52.f b/lapack-netlib/TESTING/EIG/cget52.f index 30bf6ba394..558ec60ba6 100644 --- a/lapack-netlib/TESTING/EIG/cget52.f +++ b/lapack-netlib/TESTING/EIG/cget52.f @@ -256,7 +256,7 @@ SUBROUTINE CGET52( LEFT, N, A, LDA, B, LDB, E, LDE, ALPHA, BETA, END IF CALL CGEMV( TRANS, N, N, ACOEFF, A, LDA, E( 1, JVEC ), 1, $ CZERO, WORK( N*( JVEC-1 )+1 ), 1 ) - CALL CGEMV( TRANS, N, N, -BCOEFF, B, LDA, E( 1, JVEC ), 1, + CALL CGEMV( TRANS, N, N, -BCOEFF, B, LDB, E( 1, JVEC ), 1, $ CONE, WORK( N*( JVEC-1 )+1 ), 1 ) 10 CONTINUE * From 7018c1b001bd79dff9d7af842b7f48c66ec03dcb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Oct 2024 18:56:44 +0200 Subject: [PATCH 096/244] Fix leading dimension for B (Reference-LAPACK PR 1064) --- lapack-netlib/TESTING/EIG/dget52.f | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/dget52.f b/lapack-netlib/TESTING/EIG/dget52.f index 68196f5f3b..b662bb6945 100644 --- a/lapack-netlib/TESTING/EIG/dget52.f +++ b/lapack-netlib/TESTING/EIG/dget52.f @@ -293,7 +293,7 @@ SUBROUTINE DGET52( LEFT, N, A, LDA, B, LDB, E, LDE, ALPHAR, BCOEFR = SCALE*SALFR CALL DGEMV( TRANS, N, N, ACOEF, A, LDA, E( 1, JVEC ), 1, $ ZERO, WORK( N*( JVEC-1 )+1 ), 1 ) - CALL DGEMV( TRANS, N, N, -BCOEFR, B, LDA, E( 1, JVEC ), + CALL DGEMV( TRANS, N, N, -BCOEFR, B, LDB, E( 1, JVEC ), $ 1, ONE, WORK( N*( JVEC-1 )+1 ), 1 ) ELSE * @@ -323,16 +323,16 @@ SUBROUTINE DGET52( LEFT, N, A, LDA, B, LDB, E, LDE, ALPHAR, * CALL DGEMV( TRANS, N, N, ACOEF, A, LDA, E( 1, JVEC ), 1, $ ZERO, WORK( N*( JVEC-1 )+1 ), 1 ) - CALL DGEMV( TRANS, N, N, -BCOEFR, B, LDA, E( 1, JVEC ), + CALL DGEMV( TRANS, N, N, -BCOEFR, B, LDB, E( 1, JVEC ), $ 1, ONE, WORK( N*( JVEC-1 )+1 ), 1 ) - CALL DGEMV( TRANS, N, N, BCOEFI, B, LDA, E( 1, JVEC+1 ), + CALL DGEMV( TRANS, N, N, BCOEFI, B, LDB, E( 1, JVEC+1 ), $ 1, ONE, WORK( N*( JVEC-1 )+1 ), 1 ) * CALL DGEMV( TRANS, N, N, ACOEF, A, LDA, E( 1, JVEC+1 ), $ 1, ZERO, WORK( N*JVEC+1 ), 1 ) - CALL DGEMV( TRANS, N, N, -BCOEFI, B, LDA, E( 1, JVEC ), + CALL DGEMV( TRANS, N, N, -BCOEFI, B, LDB, E( 1, JVEC ), $ 1, ONE, WORK( N*JVEC+1 ), 1 ) - CALL DGEMV( TRANS, N, N, -BCOEFR, B, LDA, E( 1, JVEC+1 ), + CALL DGEMV( TRANS, N, N, -BCOEFR, B, LDB, E( 1, JVEC+1 ), $ 1, ONE, WORK( N*JVEC+1 ), 1 ) END IF END IF From 27ed6da33133b3e866a0dc744739a5d12c1862bc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Oct 2024 18:57:50 +0200 Subject: [PATCH 097/244] Fix leading dimension for B (Reference-LAPACK PR 1064) --- lapack-netlib/TESTING/EIG/sget52.f | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/sget52.f b/lapack-netlib/TESTING/EIG/sget52.f index 9f54126021..5bd5e414c2 100644 --- a/lapack-netlib/TESTING/EIG/sget52.f +++ b/lapack-netlib/TESTING/EIG/sget52.f @@ -293,7 +293,7 @@ SUBROUTINE SGET52( LEFT, N, A, LDA, B, LDB, E, LDE, ALPHAR, BCOEFR = SCALE*SALFR CALL SGEMV( TRANS, N, N, ACOEF, A, LDA, E( 1, JVEC ), 1, $ ZERO, WORK( N*( JVEC-1 )+1 ), 1 ) - 
CALL SGEMV( TRANS, N, N, -BCOEFR, B, LDA, E( 1, JVEC ), + CALL SGEMV( TRANS, N, N, -BCOEFR, B, LDB, E( 1, JVEC ), $ 1, ONE, WORK( N*( JVEC-1 )+1 ), 1 ) ELSE * @@ -323,16 +323,16 @@ SUBROUTINE SGET52( LEFT, N, A, LDA, B, LDB, E, LDE, ALPHAR, * CALL SGEMV( TRANS, N, N, ACOEF, A, LDA, E( 1, JVEC ), 1, $ ZERO, WORK( N*( JVEC-1 )+1 ), 1 ) - CALL SGEMV( TRANS, N, N, -BCOEFR, B, LDA, E( 1, JVEC ), + CALL SGEMV( TRANS, N, N, -BCOEFR, B, LDB, E( 1, JVEC ), $ 1, ONE, WORK( N*( JVEC-1 )+1 ), 1 ) - CALL SGEMV( TRANS, N, N, BCOEFI, B, LDA, E( 1, JVEC+1 ), + CALL SGEMV( TRANS, N, N, BCOEFI, B, LDB, E( 1, JVEC+1 ), $ 1, ONE, WORK( N*( JVEC-1 )+1 ), 1 ) * CALL SGEMV( TRANS, N, N, ACOEF, A, LDA, E( 1, JVEC+1 ), $ 1, ZERO, WORK( N*JVEC+1 ), 1 ) - CALL SGEMV( TRANS, N, N, -BCOEFI, B, LDA, E( 1, JVEC ), + CALL SGEMV( TRANS, N, N, -BCOEFI, B, LDB, E( 1, JVEC ), $ 1, ONE, WORK( N*JVEC+1 ), 1 ) - CALL SGEMV( TRANS, N, N, -BCOEFR, B, LDA, E( 1, JVEC+1 ), + CALL SGEMV( TRANS, N, N, -BCOEFR, B, LDB, E( 1, JVEC+1 ), $ 1, ONE, WORK( N*JVEC+1 ), 1 ) END IF END IF From 22628f1a6943263bee10dad5fe6a1f12ea572e41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Oct 2024 18:59:03 +0200 Subject: [PATCH 098/244] Fix leading dimension for B (Reference-LAPACK PR 1064) --- lapack-netlib/TESTING/EIG/zget52.f | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/zget52.f b/lapack-netlib/TESTING/EIG/zget52.f index e22939a6c2..1e9ebafe55 100644 --- a/lapack-netlib/TESTING/EIG/zget52.f +++ b/lapack-netlib/TESTING/EIG/zget52.f @@ -257,7 +257,7 @@ SUBROUTINE ZGET52( LEFT, N, A, LDA, B, LDB, E, LDE, ALPHA, BETA, END IF CALL ZGEMV( TRANS, N, N, ACOEFF, A, LDA, E( 1, JVEC ), 1, $ CZERO, WORK( N*( JVEC-1 )+1 ), 1 ) - CALL ZGEMV( TRANS, N, N, -BCOEFF, B, LDA, E( 1, JVEC ), 1, + CALL ZGEMV( TRANS, N, N, -BCOEFF, B, LDB, E( 1, JVEC ), 1, $ CONE, WORK( N*( JVEC-1 )+1 ), 1 ) 10 CONTINUE * From 457d1c6972a628f21db2b313a7796182d03a7a17 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Oct 2024 10:33:08 +0200 Subject: [PATCH 099/244] remove unused CI badges, wiki->docs, xianyi->OpenMathLib --- README.md | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index f6c7ec7431..4bff64b153 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,8 @@ [![Join the chat at https://gitter.im/xianyi/OpenBLAS](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) -Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.com/xianyi/OpenBLAS) - -AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) - Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS) - + [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) @@ -19,7 +15,7 @@ OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=O OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. -Please read the documentation on the OpenBLAS wiki pages: . +Please read the documentation in the OpenBLAS folder: . 
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: . On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six @@ -31,12 +27,12 @@ We provide official binary packages for the following platform: * Windows x86/x86_64 -You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases). +You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/OpenMathLib/OpenBLAS/releases](https://github.com/OpenMathLib/OpenBLAS/releases). ## Installation from Source -Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code -using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be +Download from project homepage, https://github.com/OpenMathLib/OpenBLAS/, or check out the code +using Git from https://github.com/OpenMathLib/OpenBLAS.git. (If you want the most up to date version, be sure to use the develop branch - master is several years out of date due to a change of maintainership.) Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. Most can also be given directly on the make or cmake command line. @@ -45,10 +41,10 @@ Most can also be given directly on the make or cmake command line. Building OpenBLAS requires the following to be installed: -* GNU Make +* GNU Make or CMake * A C compiler, e.g. GCC or Clang * A Fortran compiler (optional, for LAPACK) -* IBM MASS (optional, see below) + ### Normal compile @@ -66,24 +62,22 @@ build options you plan to set. ### Cross compile -Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler. +Set `CC` and `FC` to point to the cross toolchains, and if you use `make`, also set `HOSTCC` to your host C compiler. The target must be specified explicitly when cross compiling. 
Examples: -* On an x86 box, compile this library for a loongson3a CPU: +* On a Linux system, cross-compiling to an older MIPS64 router board: ```sh - make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A + make BINARY=64 CC=mipsisa64r6el-linux-gnuabi64-gcc FC=mipsisa64r6el-linux-gnuabi64-gfortran HOSTCC=gcc TARGET=P6600 ``` - or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI: +* or to a Windows x64 host: ```sh - make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A + make CC="i686-w64-mingw32-gcc -Bstatic" FC="i686-w64-mingw32-gfortran -static-libgfortran" TARGET=HASWELL BINARY=32 CROSS=1 NUM_THREADS=20 CONSISTENT_FPCSR=1 HOSTCC=gcc ``` -* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: - ```sh - make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32 - ``` +You can find instructions for other cases both in the "Supported Systems" section below and in the docs folder. The .yml scripts included with the sources (which contain the +build scripts for the "continuous integration" (CI) build tests automatically run on every proposed change to the sources) may also provide additional hints. When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build. @@ -315,20 +309,21 @@ If you compile this library with `USE_OPENMP=1`, you should use the above functi ## Reporting bugs -Please submit an issue in https://github.com/xianyi/OpenBLAS/issues. +Please submit an issue in https://github.com/OpenMathLib/OpenBLAS/issues. ## Contact ++ Use github discussions: https://github.com/OpenMathLib/OpenBLAS/discussions * OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users * OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev ## Change log -Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version. +Please see Changelog.txt. ## Troubleshooting -* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first. +* Please read the [FAQ](https://github.com/OpenMathLib/OpenBLAS/docs/faq.md) in the docs folder first. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. @@ -345,9 +340,9 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 ## Contributing -1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue +1. [Check for open issues](https://github.com/OpenMathLib/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug. -2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes. +2. Fork the [OpenBLAS](https://github.com/OpenMathLib/OpenBLAS) repository to start making your changes. 3. Write a test which shows that the bug was fixed or that the feature works as expected. 4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
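
A side note on the leading-dimension corrections in patches 095 through 098 above: the ?GET52 test routines were passing LDA, the leading dimension of A, to the GEMV calls that operate on B. In column-major storage, element (i,j) of a matrix with leading dimension ld lives at offset i + j*ld, so whenever B is allocated with LDB != LDA the wrong stride silently pulls values from the wrong columns, or reads past the end of B. A minimal sketch of that failure mode follows; colmajor_gemv is an illustrative helper written for this note, not an OpenBLAS or LAPACK routine.

```c
#include <stdio.h>

/* y = M*x for an n x n column-major matrix with leading dimension ld:
   element (i,j) sits at m[i + j*ld]. */
static void colmajor_gemv(int n, const float *m, int ld,
                          const float *x, float *y)
{
    for (int i = 0; i < n; i++) {
        float sum = 0.0f;
        for (int j = 0; j < n; j++)
            sum += m[i + j * ld] * x[j];
        y[i] = sum;
    }
}

int main(void)
{
    /* A 2x2 matrix B stored with leading dimension 4; the -1 entries are
       column padding that a correct traversal never touches. */
    float B[8] = { 1, 2, -1, -1,    /* column 0 plus padding */
                   3, 4, -1, -1 };  /* column 1 plus padding */
    float x[2] = { 1, 1 }, y[2];

    colmajor_gemv(2, B, 4, x, y);   /* correct ld: y = {4, 6} */
    printf("ld=4: %g %g\n", y[0], y[1]);

    colmajor_gemv(2, B, 2, x, y);   /* wrong ld: the padding leaks in */
    printf("ld=2: %g %g\n", y[0], y[1]);
    return 0;
}
```

With the correct leading dimension the first call prints 4 and 6; with the wrong one it prints 0 and 1, quietly mixing the padding into the result. That is exactly the kind of silent corruption the LDA-to-LDB fixes above remove whenever the test matrices are allocated with distinct leading dimensions.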
From 15edb441bf827c03ab39c6dc693f6c01c1da971e Mon Sep 17 00:00:00 2001 From: gxw Date: Mon, 14 Oct 2024 17:36:56 +0800 Subject: [PATCH 100/244] LoongArch64: Opt somatcopy_rt with LASX --- kernel/loongarch64/KERNEL.LA464 | 2 + kernel/loongarch64/somatcopy_rt_lasx.c | 200 +++++++++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 kernel/loongarch64/somatcopy_rt_lasx.c diff --git a/kernel/loongarch64/KERNEL.LA464 b/kernel/loongarch64/KERNEL.LA464 index eff1581d9c..2eec3cd55e 100644 --- a/kernel/loongarch64/KERNEL.LA464 +++ b/kernel/loongarch64/KERNEL.LA464 @@ -172,4 +172,6 @@ DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_lasx.S DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_lasx.S DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_lasx.S DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_lasx.S + +SOMATCOPY_RT = somatcopy_rt_lasx.c endif diff --git a/kernel/loongarch64/somatcopy_rt_lasx.c b/kernel/loongarch64/somatcopy_rt_lasx.c new file mode 100644 index 0000000000..51a141149e --- /dev/null +++ b/kernel/loongarch64/somatcopy_rt_lasx.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#define SAVE1x4(c1) \ + "vfmul.s $vr"#c1",$vr"#c1",$vr15;vstelm.w $vr"#c1",%4,0,0;add.d %4,%4,%3;\n\t" \ + "vstelm.w $vr"#c1",%4,0,1;add.d %4,%4,%3;vstelm.w $vr"#c1",%4,0,2;add.d %4,%4,%3; \n\t" \ + "vstelm.w $vr"#c1",%4,0,3;add.d %4,%4,%3;\n\t" +#define SAVE_2x4(c1, c2, t1, t2) \ + "vilvl.w $vr"#t1",$vr"#c2",$vr"#c1";vfmul.s $vr"#t1",$vr"#t1",$vr15; \n\t" \ + "vstelm.d $vr"#t1",%4,0,0;add.d %4,%4,%3;vstelm.d $vr"#t1",%4,0,1;add.d %4,%4,%3 \n\t" \ + "vilvh.w $vr"#t2",$vr"#c2",$vr"#c1";vfmul.s $vr"#t2",$vr"#t2",$vr15; \n\t" \ + "vstelm.d $vr"#t2",%4,0,0;add.d %4,%4,%3;vstelm.d $vr"#t2",%4,0,1;add.d %4,%4,%3 \n\t" +#define SAVE_4x1(b1) \ + "vst $vr"#b1",%4,0; add.d %4,%4,%3 \n\t" +#define SAVE_4x2(b1, b2) \ + "vst $vr"#b1",%4,0; add.d %4,%4,%3; vst $vr"#b2",%4,0; add.d %4,%4,%3 \n\t" +#define SAVE_4x4(b1, b2, b3, b4) \ + "vst $vr"#b1",%4,0; add.d %4,%4,%3; vst $vr"#b2",%4,0; add.d %4,%4,%3 \n\t" \ + "vst $vr"#b3",%4,0; add.d %4,%4,%3; vst $vr"#b4",%4,0; add.d %4,%4,%3 \n\t" +#define SAVE_4x8(b1, b2, b3, b4) \ + SAVE_4x4(b1, b2, b3, b4) \ + "xvpermi.q $xr"#b1",$xr"#b1",1; xvpermi.q $xr"#b2",$xr"#b2",1; xvpermi.q $xr"#b3",$xr"#b3",1; xvpermi.q $xr"#b4",$xr"#b4",1; \n\t" \ + SAVE_4x4(b1, b2, b3, b4) + +#define TRANS_4x4(a1, a2, a3, a4, t1, t2, t3, t4) \ + "vilvl.w $vr"#t1",$vr"#a2",$vr"#a1";vilvh.w $vr"#t2",$vr"#a2",$vr"#a1"; \n\t" \ + "vilvl.w $vr"#t3",$vr"#a4",$vr"#a3";vilvh.w $vr"#t4",$vr"#a4",$vr"#a3"; \n\t" \ + "vilvl.d $vr"#a1",$vr"#t3",$vr"#t1";vilvh.d $vr"#a2",$vr"#t3",$vr"#t1"; \n\t" \ + "vilvl.d $vr"#a3",$vr"#t4",$vr"#t2";vilvh.d $vr"#a4",$vr"#t4",$vr"#t2"; \n\t" +#define TRANS_4x8(a1, a2, a3, a4, t1, t2, t3, t4) \ + "xvilvl.w $xr"#t1",$xr"#a2",$xr"#a1"; xvilvh.w $xr"#t2",$xr"#a2",$xr"#a1"; \n\t" \ + "xvilvl.w $xr"#t3",$xr"#a4",$xr"#a3"; xvilvh.w $xr"#t4",$xr"#a4",$xr"#a3"; \n\t" \ + "xvilvl.d $xr"#a1",$xr"#t3",$xr"#t1"; xvilvh.d $xr"#a2",$xr"#t3",$xr"#t1"; \n\t" \ + "xvilvl.d $xr"#a3",$xr"#t4",$xr"#t2"; xvilvh.d $xr"#a4",$xr"#t4",$xr"#t2"; \n\t" +#define COPY_4x16 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "xvld $xr0,%0,0; xvld $xr4,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; xvld $xr5,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr2,%0,0; xvld $xr6,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr3,%0,0; xvld $xr7,%0,32; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + "xvfmul.s $xr4,$xr4,$xr15;xvfmul.s $xr5,$xr5,$xr15;xvfmul.s $xr6,$xr6,$xr15;xvfmul.s $xr7,$xr7,$xr15 \n\t" \ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) \ + TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) +#define COPY_4x8 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr2,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr3,%0,0; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) +#define COPY_4x4 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr2,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr3,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15;vfmul.s $vr2,$vr2,$vr15;vfmul.s $vr3,$vr3,$vr15 \n\t" \ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) +#define COPY_4x2 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "vld 
$vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr2,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr3,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15;vfmul.s $vr2,$vr2,$vr15;vfmul.s $vr3,$vr3,$vr15 \n\t" \ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x2(0,1) +#define COPY_4x1 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "fld.s $f0,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f1,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f2,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f3,%0,0; add.d %0,%0,%2; \n\t" \ + "xvinsve0.w $xr0,$xr1,1;xvinsve0.w $xr0,$xr2,2;xvinsve0.w $xr0,$xr3,3; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15; \n\t" \ + SAVE_4x1(0) + +#define COPY_2x16 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "xvld $xr0,%0,0; xvld $xr2,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; xvld $xr3,%0,32; add.d %0,%0,%2; \n\t" \ + "xvpermi.q $xr4,$xr0,1;xvpermi.q $xr6,$xr2,1;xvpermi.q $xr5,$xr1,1;xvpermi.q $xr7,$xr3,1; \n\t" \ + SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9) +#define COPY_2x8 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; add.d %0,%0,%2; \n\t" \ + "xvpermi.q $xr2,$xr0,1;xvpermi.q $xr3,$xr1,1; \n\t" \ + SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5) +#define COPY_2x4 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + SAVE_2x4(0,1,4,5) +#define COPY_2x2 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "fld.d $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fld.d $f1,%0,0;add.d %0,%0,%2; \n\t" \ + "xvinsve0.d $xr0,$xr1,1;vfmul.s $vr0,$vr0,$vr15;vshuf4i.w $vr0,$vr0,0xd8 \n\t" \ + "vstelm.d $vr0,%4,0,0;add.d %4,%4,%3;vstelm.d $vr0,%4,0,1 \n\t" +#define COPY_2x1 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "fld.s $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fld.s $f1,%0,0;add.d %0,%0,%2; \n\t" \ + "xvinsve0.w $xr0,$xr1,1;vfmul.s $vr0,$vr0,$vr15; \n\t" \ + "vstelm.d $vr0,%4,0,0; \n\t" + +#define COPY_1x16 \ + "move %4,%1; addi.d %1,%1,4 \n\t" \ + "vld $vr1,%0,0;" SAVE1x4(1) "vld $vr2,%0,16;" SAVE1x4(2) \ + "vld $vr1,%0,32;" SAVE1x4(1) "vld $vr2,%0,48;" SAVE1x4(2) \ + "add.d %0,%0,%2 \n\t" +#define COPY_1x8 \ + "move %4,%1; addi.d %1,%1,4 \n\t" \ + "vld $vr1,%0,0;" SAVE1x4(1) "vld $vr2,%0,16;" SAVE1x4(2) \ + "add.d %0,%0,%2 \n\t" +#define COPY_1x4 \ + "move %4,%1; addi.d %1,%1,4 \n\t" \ + "vld $vr1,%0,0;" SAVE1x4(1) \ + "add.d %0,%0,%2 \n\t" +#define COPY_1x2 \ + "move %4,%1;fld.d $f1,%0,0;add.d %0,%0,%2;vfmul.s $vr1,$vr1,$vr15;vstelm.w $vr1,%4,0,0;add.d %4,%4,%3;vstelm.w $vr1,%4,0,1;\n\t" \ + "addi.d %1,%1,4;\n\t" +#define COPY_1x1 \ + "fld.s $f1,%0,0;fmul.s $f1,$f1,$f15;fst.s $f1,%1,0;add.d %0,%0,%2;addi.d %1,%1,4;\n\t" + +#define ROWS_OF_BLOCK 128 + +#define COMPUTE(ndim) \ + src = src_base; dst = dst_base; \ + __asm__ __volatile__( \ + "xvldrepl.w $xr15, %6, 0 \n\t" \ + "srli.d $r6, %5, 2 \n\t" \ + "beqz $r6, "#ndim"3f \n\t" \ + #ndim"4: \n\t" \ + COPY_4x##ndim \ + "addi.d $r6, $r6, -1 \n\t" \ + "bnez $r6, "#ndim"4b \n\t" \ + #ndim"3: \n\t" \ + "andi $r6, %5, 2 \n\t" \ + "beqz $r6, "#ndim"1f \n\t" \ + #ndim"2: \n\t" \ + COPY_2x##ndim \ + #ndim"1: \n\t" \ + "andi $r6, %5, 1 \n\t" \ + "beqz $r6, "#ndim"0f \n\t" \ + COPY_1x##ndim \ + #ndim"0: \n\t" \ + :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp) \ + :"r"(num_rows),"r"(&ALPHA) \ + :"memory", "$r6", "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8", "$f9", "$f10", "$f11", "$f15" \ + ); + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, 
FLOAT *b, BLASLONG ldb){ + float *src, *dst, *dst_tmp=0, *src_base, *dst_base; + uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0; + BLASLONG cols_left, rows_done; float ALPHA = alpha; + if (ALPHA == 0.0) { + dst_base = b; + for (cols_left = cols; cols_left > 0; cols_left--) {memset(dst_base, 0, rows * sizeof(float)); dst_base += ldb;} + return 0; + } + for (rows_done = 0; rows_done < rows; rows_done += num_rows) { + num_rows = rows - rows_done; + if (num_rows > ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK; + cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done; + for (;cols_left > 15; cols_left -= 16) {COMPUTE(16) src_base += 16; dst_base += 16 * ldb;} + for (;cols_left > 7; cols_left -= 8) {COMPUTE(8) src_base += 8; dst_base += 8 * ldb;} + for (;cols_left > 3; cols_left -= 4) {COMPUTE(4) src_base += 4; dst_base += 4 * ldb;} + for (;cols_left > 1; cols_left -= 2) {COMPUTE(2) src_base += 2; dst_base += 2 * ldb;} + if (cols_left > 0) {COMPUTE(1) src_base ++; dst_base += ldb;} + } + return 0; +} From acf6cab30478787922d03895ba27960ee293569f Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 17 Oct 2024 09:50:02 +0000 Subject: [PATCH 101/244] LoongArch64: Opt somatcopy_rn with LASX --- kernel/loongarch64/KERNEL.LA464 | 1 + kernel/loongarch64/somatcopy_rn_lasx.c | 193 +++++++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 kernel/loongarch64/somatcopy_rn_lasx.c diff --git a/kernel/loongarch64/KERNEL.LA464 b/kernel/loongarch64/KERNEL.LA464 index 2eec3cd55e..1a6b6ee6c5 100644 --- a/kernel/loongarch64/KERNEL.LA464 +++ b/kernel/loongarch64/KERNEL.LA464 @@ -174,4 +174,5 @@ DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_lasx.S DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_lasx.S SOMATCOPY_RT = somatcopy_rt_lasx.c +SOMATCOPY_RN = somatcopy_rn_lasx.c endif diff --git a/kernel/loongarch64/somatcopy_rn_lasx.c b/kernel/loongarch64/somatcopy_rn_lasx.c new file mode 100644 index 0000000000..c9bfb4c5e8 --- /dev/null +++ b/kernel/loongarch64/somatcopy_rn_lasx.c @@ -0,0 +1,193 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#define SAVE_4x1(b1, b2, b3, b4) \ + "vstelm.w $vr"#b1",%1,0,0;add.d %1,%1,%3;vstelm.w $vr"#b2",%1,0,0;add.d %1,%1,%3;\n\t" \ + "vstelm.w $vr"#b3",%1,0,0;add.d %1,%1,%3;vstelm.w $vr"#b4",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_4x2(b1, b2, b3, b4) \ + "vstelm.d $vr"#b1",%1,0,0;add.d %1,%1,%3;vstelm.d $vr"#b2",%1,0,0;add.d %1,%1,%3;\n\t" \ + "vstelm.d $vr"#b3",%1,0,0;add.d %1,%1,%3;vstelm.d $vr"#b4",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_4x4(b1, b2, b3, b4) \ + "vst $vr"#b1",%1,0;add.d %1,%1,%3;vst $vr"#b2",%1,0;add.d %1,%1,%3;\n\t" \ + "vst $vr"#b3",%1,0;add.d %1,%1,%3;vst $vr"#b4",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_4x8(b1, b2, b3, b4) \ + "xvst $xr"#b1",%1,0;add.d %1,%1,%3;xvst $xr"#b2",%1,0;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b3",%1,0;add.d %1,%1,%3;xvst $xr"#b4",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_4x16(b1, b2, b3, b4, b5, b6, b7, b8) \ + "xvst $xr"#b1",%1,0;xvst $xr"#b2",%1,32;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b3",%1,0;xvst $xr"#b4",%1,32;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b5",%1,0;xvst $xr"#b6",%1,32;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b7",%1,0;xvst $xr"#b8",%1,32;add.d %1,%1,%3;\n\t" +#define SAVE_2x16(b1, b2, b3, b4) \ + "xvst $xr"#b1",%1,0;xvst $xr"#b2",%1,32;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b3",%1,0;xvst $xr"#b4",%1,32;add.d %1,%1,%3;\n\t" +#define SAVE_2x8(b1, b2) \ + "xvst $xr"#b1",%1,0;add.d %1,%1,%3;xvst $xr"#b2",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_2x4(b1, b2) \ + "vst $vr"#b1",%1,0;add.d %1,%1,%3;vst $vr"#b2",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_2x2(b1, b2) \ + "vstelm.d $vr"#b1",%1,0,0;add.d %1,%1,%3;vstelm.d $vr"#b2",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_2x1(b1, b2) \ + "vstelm.w $vr"#b1",%1,0,0;add.d %1,%1,%3;vstelm.w $vr"#b2",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_1x16(b1, b2) \ + "xvst $xr"#b1",%1,0;xvst $xr"#b2",%1,32;add.d %1,%1,%3;\n\t" +#define SAVE_1x8(b1) \ + "xvst $xr"#b1",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_1x4(b1) \ + "vst $vr"#b1",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_1x2(b1) \ + "vstelm.d $vr"#b1",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_1x1(b1) \ + "vstelm.w $vr"#b1",%1,0,0;add.d %1,%1,%3;\n\t" +#define COPY_4x16 \ + "xvld $xr0,%0,0; xvld $xr4,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; xvld $xr5,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr2,%0,0; xvld $xr6,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr3,%0,0; xvld $xr7,%0,32; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + "xvfmul.s $xr4,$xr4,$xr15;xvfmul.s $xr5,$xr5,$xr15;xvfmul.s $xr6,$xr6,$xr15;xvfmul.s $xr7,$xr7,$xr15 \n\t" \ + SAVE_4x16(0,4,1,5,2,6,3,7) +#define COPY_4x8 \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr2,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr3,%0,0; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + 
SAVE_4x8(0,1,2,3) +#define COPY_4x4 \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr2,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr3,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15;vfmul.s $vr2,$vr2,$vr15;vfmul.s $vr3,$vr3,$vr15 \n\t" \ + SAVE_4x4(0,1,2,3) +#define COPY_4x2 \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr2,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr3,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15;vfmul.s $vr2,$vr2,$vr15;vfmul.s $vr3,$vr3,$vr15 \n\t" \ + SAVE_4x2(0,1,2,3) +#define COPY_4x1 \ + "fld.s $f0,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f1,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f2,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f3,%0,0; add.d %0,%0,%2; \n\t" \ + "fmul.s $f0,$f0,$f15;fmul.s $f1,$f1,$f15;fmul.s $f2,$f2,$f15;fmul.s $f3,$f3,$f15 \n\t" \ + SAVE_4x1(0,1,2,3) +#define COPY_2x16 \ + "xvld $xr0,%0,0; xvld $xr2,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; xvld $xr3,%0,32; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + SAVE_2x16(0,2,1,3) +#define COPY_2x8 \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15; \n\t" \ + SAVE_2x8(0,1) +#define COPY_2x4 \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15; \n\t" \ + SAVE_2x4(0,1) +#define COPY_2x2 \ + "fld.d $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fld.d $f1,%0,0;add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15; \n\t" \ + SAVE_2x2(0,1) +#define COPY_2x1 \ + "fld.s $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fld.s $f1,%0,0;add.d %0,%0,%2; \n\t" \ + "fmul.s $f0,$f0,$f15;fmul.s $f1,$f1,$f15; \n\t" \ + SAVE_2x1(0,1) +#define COPY_1x16 \ + "xvld $xr0,%0,0; xvld $xr1,%0,32; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15; \n\t" \ + SAVE_1x16(0,1) +#define COPY_1x8 \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15; \n\t" \ + SAVE_1x8(0) +#define COPY_1x4 \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15; \n\t" \ + SAVE_1x4(0) +#define COPY_1x2 \ + "fld.d $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15; \n\t" \ + SAVE_1x2(0) +#define COPY_1x1 \ + "fld.s $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fmul.s $f0,$f0,$f15; \n\t" \ + SAVE_1x1(0) +#define ROWS_OF_BLOCK 128 +#define COMPUTE(ndim) \ + src = src_base; dst = dst_base; \ + __asm__ __volatile__( \ + "xvldrepl.w $xr15, %6, 0 \n\t" \ + "srli.d $r6, %5, 2 \n\t" \ + "beqz $r6, "#ndim"3f \n\t" \ + #ndim"4: \n\t" \ + COPY_4x##ndim \ + "addi.d $r6, $r6, -1 \n\t" \ + "bnez $r6, "#ndim"4b \n\t" \ + #ndim"3: \n\t" \ + "andi $r6, %5, 2 \n\t" \ + "beqz $r6, "#ndim"1f \n\t" \ + #ndim"2: \n\t" \ + COPY_2x##ndim \ + #ndim"1: \n\t" \ + "andi $r6, %5, 1 \n\t" \ + "beqz $r6, "#ndim"0f \n\t" \ + COPY_1x##ndim \ + #ndim"0: \n\t" \ + :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp) \ + :"r"(num_rows),"r"(&ALPHA) \ + :"memory", "$r6", "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8", "$f9", "$f10", "$f11", "$f15" \ + ); + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ + float *src, *dst, *dst_tmp=0, *src_base, *dst_base; + uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb 
* sizeof(float), num_rows = 0; + BLASLONG cols_left, rows_done; float ALPHA = alpha; + if (ALPHA == 0.0) { + dst_base = b; + for (cols_left = cols; cols_left > 0; cols_left--) {memset(dst_base, 0, rows * sizeof(float)); dst_base += ldb;} + return 0; + } + for (rows_done = 0; rows_done < rows; rows_done += num_rows) { + num_rows = rows - rows_done; + if (num_rows > ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK; + cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done; + for (;cols_left > 15; cols_left -= 16) {COMPUTE(16) src_base += 16; dst_base += 16;} + for (;cols_left > 7; cols_left -= 8) {COMPUTE(8) src_base += 8; dst_base += 8;} + for (;cols_left > 3; cols_left -= 4) {COMPUTE(4) src_base += 4; dst_base += 4;} + for (;cols_left > 1; cols_left -= 2) {COMPUTE(2) src_base += 2; dst_base += 2;} + if (cols_left > 0) {COMPUTE(1) src_base ++; dst_base ++;} + } + return 0; +} From b37129341ba4372caf2fdc680edbb09f0628d06b Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 17 Oct 2024 11:27:55 +0000 Subject: [PATCH 102/244] LoongArch64: Opt somatcopy_cn with LASX --- kernel/loongarch64/KERNEL.LA464 | 1 + kernel/loongarch64/somatcopy_cn_lasx.c | 193 +++++++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 kernel/loongarch64/somatcopy_cn_lasx.c diff --git a/kernel/loongarch64/KERNEL.LA464 b/kernel/loongarch64/KERNEL.LA464 index 1a6b6ee6c5..f664f551b1 100644 --- a/kernel/loongarch64/KERNEL.LA464 +++ b/kernel/loongarch64/KERNEL.LA464 @@ -175,4 +175,5 @@ DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_lasx.S DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_lasx.S SOMATCOPY_RT = somatcopy_rt_lasx.c SOMATCOPY_RN = somatcopy_rn_lasx.c +SOMATCOPY_CN = somatcopy_cn_lasx.c endif diff --git a/kernel/loongarch64/somatcopy_cn_lasx.c b/kernel/loongarch64/somatcopy_cn_lasx.c new file mode 100644 index 0000000000..e4d1f9f901 --- /dev/null +++ b/kernel/loongarch64/somatcopy_cn_lasx.c @@ -0,0 +1,193 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ + +#include "common.h" +#define SAVE_4x1(b1, b2, b3, b4) \ + "vstelm.w $vr"#b1",%1,0,0;add.d %1,%1,%3;vstelm.w $vr"#b2",%1,0,0;add.d %1,%1,%3;\n\t" \ + "vstelm.w $vr"#b3",%1,0,0;add.d %1,%1,%3;vstelm.w $vr"#b4",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_4x2(b1, b2, b3, b4) \ + "vstelm.d $vr"#b1",%1,0,0;add.d %1,%1,%3;vstelm.d $vr"#b2",%1,0,0;add.d %1,%1,%3;\n\t" \ + "vstelm.d $vr"#b3",%1,0,0;add.d %1,%1,%3;vstelm.d $vr"#b4",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_4x4(b1, b2, b3, b4) \ + "vst $vr"#b1",%1,0;add.d %1,%1,%3;vst $vr"#b2",%1,0;add.d %1,%1,%3;\n\t" \ + "vst $vr"#b3",%1,0;add.d %1,%1,%3;vst $vr"#b4",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_4x8(b1, b2, b3, b4) \ + "xvst $xr"#b1",%1,0;add.d %1,%1,%3;xvst $xr"#b2",%1,0;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b3",%1,0;add.d %1,%1,%3;xvst $xr"#b4",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_4x16(b1, b2, b3, b4, b5, b6, b7, b8) \ + "xvst $xr"#b1",%1,0;xvst $xr"#b2",%1,32;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b3",%1,0;xvst $xr"#b4",%1,32;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b5",%1,0;xvst $xr"#b6",%1,32;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b7",%1,0;xvst $xr"#b8",%1,32;add.d %1,%1,%3;\n\t" +#define SAVE_2x16(b1, b2, b3, b4) \ + "xvst $xr"#b1",%1,0;xvst $xr"#b2",%1,32;add.d %1,%1,%3;\n\t" \ + "xvst $xr"#b3",%1,0;xvst $xr"#b4",%1,32;add.d %1,%1,%3;\n\t" +#define SAVE_2x8(b1, b2) \ + "xvst $xr"#b1",%1,0;add.d %1,%1,%3;xvst $xr"#b2",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_2x4(b1, b2) \ + "vst $vr"#b1",%1,0;add.d %1,%1,%3;vst $vr"#b2",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_2x2(b1, b2) \ + "vstelm.d $vr"#b1",%1,0,0;add.d %1,%1,%3;vstelm.d $vr"#b2",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_2x1(b1, b2) \ + "vstelm.w $vr"#b1",%1,0,0;add.d %1,%1,%3;vstelm.w $vr"#b2",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_1x16(b1, b2) \ + "xvst $xr"#b1",%1,0;xvst $xr"#b2",%1,32;add.d %1,%1,%3;\n\t" +#define SAVE_1x8(b1) \ + "xvst $xr"#b1",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_1x4(b1) \ + "vst $vr"#b1",%1,0;add.d %1,%1,%3;\n\t" +#define SAVE_1x2(b1) \ + "vstelm.d $vr"#b1",%1,0,0;add.d %1,%1,%3;\n\t" +#define SAVE_1x1(b1) \ + "vstelm.w $vr"#b1",%1,0,0;add.d %1,%1,%3;\n\t" +#define COPY_4x16 \ + "xvld $xr0,%0,0; xvld $xr4,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; xvld $xr5,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr2,%0,0; xvld $xr6,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr3,%0,0; xvld $xr7,%0,32; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + "xvfmul.s $xr4,$xr4,$xr15;xvfmul.s $xr5,$xr5,$xr15;xvfmul.s $xr6,$xr6,$xr15;xvfmul.s $xr7,$xr7,$xr15 \n\t" \ + SAVE_4x16(0,4,1,5,2,6,3,7) +#define COPY_4x8 \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr2,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr3,%0,0; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + SAVE_4x8(0,1,2,3) +#define COPY_4x4 \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr2,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr3,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15;vfmul.s $vr2,$vr2,$vr15;vfmul.s $vr3,$vr3,$vr15 \n\t" \ + SAVE_4x4(0,1,2,3) +#define COPY_4x2 \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr2,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr3,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s 
$vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15;vfmul.s $vr2,$vr2,$vr15;vfmul.s $vr3,$vr3,$vr15 \n\t" \ + SAVE_4x2(0,1,2,3) +#define COPY_4x1 \ + "fld.s $f0,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f1,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f2,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f3,%0,0; add.d %0,%0,%2; \n\t" \ + "fmul.s $f0,$f0,$f15;fmul.s $f1,$f1,$f15;fmul.s $f2,$f2,$f15;fmul.s $f3,$f3,$f15 \n\t" \ + SAVE_4x1(0,1,2,3) +#define COPY_2x16 \ + "xvld $xr0,%0,0; xvld $xr2,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; xvld $xr3,%0,32; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + SAVE_2x16(0,2,1,3) +#define COPY_2x8 \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15; \n\t" \ + SAVE_2x8(0,1) +#define COPY_2x4 \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15; \n\t" \ + SAVE_2x4(0,1) +#define COPY_2x2 \ + "fld.d $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fld.d $f1,%0,0;add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15; \n\t" \ + SAVE_2x2(0,1) +#define COPY_2x1 \ + "fld.s $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fld.s $f1,%0,0;add.d %0,%0,%2; \n\t" \ + "fmul.s $f0,$f0,$f15;fmul.s $f1,$f1,$f15; \n\t" \ + SAVE_2x1(0,1) +#define COPY_1x16 \ + "xvld $xr0,%0,0; xvld $xr1,%0,32; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15; \n\t" \ + SAVE_1x16(0,1) +#define COPY_1x8 \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15; \n\t" \ + SAVE_1x8(0) +#define COPY_1x4 \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15; \n\t" \ + SAVE_1x4(0) +#define COPY_1x2 \ + "fld.d $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15; \n\t" \ + SAVE_1x2(0) +#define COPY_1x1 \ + "fld.s $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fmul.s $f0,$f0,$f15; \n\t" \ + SAVE_1x1(0) +#define ROWS_OF_BLOCK 128 +#define COMPUTE(ndim) \ + src = src_base; dst = dst_base; \ + __asm__ __volatile__( \ + "xvldrepl.w $xr15, %6, 0 \n\t" \ + "srli.d $r6, %5, 2 \n\t" \ + "beqz $r6, "#ndim"3f \n\t" \ + #ndim"4: \n\t" \ + COPY_4x##ndim \ + "addi.d $r6, $r6, -1 \n\t" \ + "bnez $r6, "#ndim"4b \n\t" \ + #ndim"3: \n\t" \ + "andi $r6, %5, 2 \n\t" \ + "beqz $r6, "#ndim"1f \n\t" \ + #ndim"2: \n\t" \ + COPY_2x##ndim \ + #ndim"1: \n\t" \ + "andi $r6, %5, 1 \n\t" \ + "beqz $r6, "#ndim"0f \n\t" \ + COPY_1x##ndim \ + #ndim"0: \n\t" \ + :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp) \ + :"r"(num_cols),"r"(&ALPHA) \ + :"memory", "$r6", "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8", "$f9", "$f10", "$f11", "$f15" \ + ); + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){ + float *src, *dst, *dst_tmp=0, *src_base, *dst_base; + uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_cols = 0; + BLASLONG rows_left, cols_done; float ALPHA = alpha; + if (ALPHA == 0.0) { + dst_base = b; + for (rows_left = rows; rows_left > 0; rows_left--) {memset(dst_base, 0, cols * sizeof(float)); dst_base += ldb;} + return 0; + } + for (cols_done = 0; cols_done < cols; cols_done += num_cols) { + num_cols = cols - cols_done; + if (num_cols > ROWS_OF_BLOCK) num_cols = ROWS_OF_BLOCK; + rows_left = rows; src_base = a + (int64_t)lda * (int64_t)cols_done; dst_base = b + cols_done; + for (;rows_left > 15; 
rows_left -= 16) {COMPUTE(16) src_base += 16; dst_base += 16;} + for (;rows_left > 7; rows_left -= 8) {COMPUTE(8) src_base += 8; dst_base += 8;} + for (;rows_left > 3; rows_left -= 4) {COMPUTE(4) src_base += 4; dst_base += 4;} + for (;rows_left > 1; rows_left -= 2) {COMPUTE(2) src_base += 2; dst_base += 2;} + if (rows_left > 0) {COMPUTE(1) src_base ++; dst_base ++;} + } +} From bb31bbef522b8b105ae2cb1bfce484ded5839b22 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 17 Oct 2024 11:45:13 +0000 Subject: [PATCH 103/244] LoongArch64: Opt somatcopy_ct with LASX --- kernel/loongarch64/KERNEL.LA464 | 1 + kernel/loongarch64/somatcopy_ct_lasx.c | 200 +++++++++++++++++++++++++ 2 files changed, 201 insertions(+) create mode 100644 kernel/loongarch64/somatcopy_ct_lasx.c diff --git a/kernel/loongarch64/KERNEL.LA464 b/kernel/loongarch64/KERNEL.LA464 index f664f551b1..ca8c4d3884 100644 --- a/kernel/loongarch64/KERNEL.LA464 +++ b/kernel/loongarch64/KERNEL.LA464 @@ -175,5 +175,6 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_lasx.S SOMATCOPY_RT = somatcopy_rt_lasx.c SOMATCOPY_RN = somatcopy_rn_lasx.c +SOMATCOPY_CT = somatcopy_ct_lasx.c SOMATCOPY_CN = somatcopy_cn_lasx.c endif diff --git a/kernel/loongarch64/somatcopy_ct_lasx.c b/kernel/loongarch64/somatcopy_ct_lasx.c new file mode 100644 index 0000000000..c400d186ef --- /dev/null +++ b/kernel/loongarch64/somatcopy_ct_lasx.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#define SAVE1x4(c1) \ + "vfmul.s $vr"#c1",$vr"#c1",$vr15;vstelm.w $vr"#c1",%4,0,0;add.d %4,%4,%3;\n\t" \ + "vstelm.w $vr"#c1",%4,0,1;add.d %4,%4,%3;vstelm.w $vr"#c1",%4,0,2;add.d %4,%4,%3; \n\t" \ + "vstelm.w $vr"#c1",%4,0,3;add.d %4,%4,%3;\n\t" +#define SAVE_2x4(c1, c2, t1, t2) \ + "vilvl.w $vr"#t1",$vr"#c2",$vr"#c1";vfmul.s $vr"#t1",$vr"#t1",$vr15; \n\t" \ + "vstelm.d $vr"#t1",%4,0,0;add.d %4,%4,%3;vstelm.d $vr"#t1",%4,0,1;add.d %4,%4,%3 \n\t" \ + "vilvh.w $vr"#t2",$vr"#c2",$vr"#c1";vfmul.s $vr"#t2",$vr"#t2",$vr15; \n\t" \ + "vstelm.d $vr"#t2",%4,0,0;add.d %4,%4,%3;vstelm.d $vr"#t2",%4,0,1;add.d %4,%4,%3 \n\t" +#define SAVE_4x1(b1) \ + "vst $vr"#b1",%4,0; add.d %4,%4,%3 \n\t" +#define SAVE_4x2(b1, b2) \ + "vst $vr"#b1",%4,0; add.d %4,%4,%3; vst $vr"#b2",%4,0; add.d %4,%4,%3 \n\t" +#define SAVE_4x4(b1, b2, b3, b4) \ + "vst $vr"#b1",%4,0; add.d %4,%4,%3; vst $vr"#b2",%4,0; add.d %4,%4,%3 \n\t" \ + "vst $vr"#b3",%4,0; add.d %4,%4,%3; vst $vr"#b4",%4,0; add.d %4,%4,%3 \n\t" +#define SAVE_4x8(b1, b2, b3, b4) \ + SAVE_4x4(b1, b2, b3, b4) \ + "xvpermi.q $xr"#b1",$xr"#b1",1; xvpermi.q $xr"#b2",$xr"#b2",1; xvpermi.q $xr"#b3",$xr"#b3",1; xvpermi.q $xr"#b4",$xr"#b4",1; \n\t" \ + SAVE_4x4(b1, b2, b3, b4) + +#define TRANS_4x4(a1, a2, a3, a4, t1, t2, t3, t4) \ + "vilvl.w $vr"#t1",$vr"#a2",$vr"#a1";vilvh.w $vr"#t2",$vr"#a2",$vr"#a1"; \n\t" \ + "vilvl.w $vr"#t3",$vr"#a4",$vr"#a3";vilvh.w $vr"#t4",$vr"#a4",$vr"#a3"; \n\t" \ + "vilvl.d $vr"#a1",$vr"#t3",$vr"#t1";vilvh.d $vr"#a2",$vr"#t3",$vr"#t1"; \n\t" \ + "vilvl.d $vr"#a3",$vr"#t4",$vr"#t2";vilvh.d $vr"#a4",$vr"#t4",$vr"#t2"; \n\t" +#define TRANS_4x8(a1, a2, a3, a4, t1, t2, t3, t4) \ + "xvilvl.w $xr"#t1",$xr"#a2",$xr"#a1"; xvilvh.w $xr"#t2",$xr"#a2",$xr"#a1"; \n\t" \ + "xvilvl.w $xr"#t3",$xr"#a4",$xr"#a3"; xvilvh.w $xr"#t4",$xr"#a4",$xr"#a3"; \n\t" \ + "xvilvl.d $xr"#a1",$xr"#t3",$xr"#t1"; xvilvh.d $xr"#a2",$xr"#t3",$xr"#t1"; \n\t" \ + "xvilvl.d $xr"#a3",$xr"#t4",$xr"#t2"; xvilvh.d $xr"#a4",$xr"#t4",$xr"#t2"; \n\t" +#define COPY_4x16 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "xvld $xr0,%0,0; xvld $xr4,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; xvld $xr5,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr2,%0,0; xvld $xr6,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr3,%0,0; xvld $xr7,%0,32; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + "xvfmul.s $xr4,$xr4,$xr15;xvfmul.s $xr5,$xr5,$xr15;xvfmul.s $xr6,$xr6,$xr15;xvfmul.s $xr7,$xr7,$xr15 \n\t" \ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) \ + TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) +#define COPY_4x8 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr2,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr3,%0,0; add.d %0,%0,%2; \n\t" \ + "xvfmul.s $xr0,$xr0,$xr15;xvfmul.s $xr1,$xr1,$xr15;xvfmul.s $xr2,$xr2,$xr15;xvfmul.s $xr3,$xr3,$xr15 \n\t" \ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) +#define COPY_4x4 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr2,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr3,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15;vfmul.s $vr2,$vr2,$vr15;vfmul.s $vr3,$vr3,$vr15 \n\t" \ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) +#define COPY_4x2 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "vld 
$vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr2,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr3,%0,0; add.d %0,%0,%2; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15;vfmul.s $vr1,$vr1,$vr15;vfmul.s $vr2,$vr2,$vr15;vfmul.s $vr3,$vr3,$vr15 \n\t" \ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x2(0,1) +#define COPY_4x1 \ + "move %4,%1; addi.d %1,%1,16 \n\t" \ + "fld.s $f0,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f1,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f2,%0,0; add.d %0,%0,%2; \n\t" \ + "fld.s $f3,%0,0; add.d %0,%0,%2; \n\t" \ + "xvinsve0.w $xr0,$xr1,1;xvinsve0.w $xr0,$xr2,2;xvinsve0.w $xr0,$xr3,3; \n\t" \ + "vfmul.s $vr0,$vr0,$vr15; \n\t" \ + SAVE_4x1(0) + +#define COPY_2x16 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "xvld $xr0,%0,0; xvld $xr2,%0,32; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; xvld $xr3,%0,32; add.d %0,%0,%2; \n\t" \ + "xvpermi.q $xr4,$xr0,1;xvpermi.q $xr6,$xr2,1;xvpermi.q $xr5,$xr1,1;xvpermi.q $xr7,$xr3,1; \n\t" \ + SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9) +#define COPY_2x8 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "xvld $xr0,%0,0; add.d %0,%0,%2; \n\t" \ + "xvld $xr1,%0,0; add.d %0,%0,%2; \n\t" \ + "xvpermi.q $xr2,$xr0,1;xvpermi.q $xr3,$xr1,1; \n\t" \ + SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5) +#define COPY_2x4 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "vld $vr0,%0,0; add.d %0,%0,%2; \n\t" \ + "vld $vr1,%0,0; add.d %0,%0,%2; \n\t" \ + SAVE_2x4(0,1,4,5) +#define COPY_2x2 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "fld.d $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fld.d $f1,%0,0;add.d %0,%0,%2; \n\t" \ + "xvinsve0.d $xr0,$xr1,1;vfmul.s $vr0,$vr0,$vr15;vshuf4i.w $vr0,$vr0,0xd8 \n\t" \ + "vstelm.d $vr0,%4,0,0;add.d %4,%4,%3;vstelm.d $vr0,%4,0,1 \n\t" +#define COPY_2x1 \ + "move %4,%1; addi.d %1,%1,8 \n\t" \ + "fld.s $f0,%0,0;add.d %0,%0,%2; \n\t" \ + "fld.s $f1,%0,0;add.d %0,%0,%2; \n\t" \ + "xvinsve0.w $xr0,$xr1,1;vfmul.s $vr0,$vr0,$vr15; \n\t" \ + "vstelm.d $vr0,%4,0,0; \n\t" + +#define COPY_1x16 \ + "move %4,%1; addi.d %1,%1,4 \n\t" \ + "vld $vr1,%0,0;" SAVE1x4(1) "vld $vr2,%0,16;" SAVE1x4(2) \ + "vld $vr1,%0,32;" SAVE1x4(1) "vld $vr2,%0,48;" SAVE1x4(2) \ + "add.d %0,%0,%2 \n\t" +#define COPY_1x8 \ + "move %4,%1; addi.d %1,%1,4 \n\t" \ + "vld $vr1,%0,0;" SAVE1x4(1) "vld $vr2,%0,16;" SAVE1x4(2) \ + "add.d %0,%0,%2 \n\t" +#define COPY_1x4 \ + "move %4,%1; addi.d %1,%1,4 \n\t" \ + "vld $vr1,%0,0;" SAVE1x4(1) \ + "add.d %0,%0,%2 \n\t" +#define COPY_1x2 \ + "move %4,%1;fld.d $f1,%0,0;add.d %0,%0,%2;vfmul.s $vr1,$vr1,$vr15;vstelm.w $vr1,%4,0,0;add.d %4,%4,%3;vstelm.w $vr1,%4,0,1;\n\t" \ + "addi.d %1,%1,4;\n\t" +#define COPY_1x1 \ + "fld.s $f1,%0,0;fmul.s $f1,$f1,$f15;fst.s $f1,%1,0;add.d %0,%0,%2;addi.d %1,%1,4;\n\t" + +#define ROWS_OF_BLOCK 128 + +#define COMPUTE(ndim) \ + src = src_base; dst = dst_base; \ + __asm__ __volatile__( \ + "xvldrepl.w $xr15, %6, 0 \n\t" \ + "srli.d $r6, %5, 2 \n\t" \ + "beqz $r6, "#ndim"3f \n\t" \ + #ndim"4: \n\t" \ + COPY_4x##ndim \ + "addi.d $r6, $r6, -1 \n\t" \ + "bnez $r6, "#ndim"4b \n\t" \ + #ndim"3: \n\t" \ + "andi $r6, %5, 2 \n\t" \ + "beqz $r6, "#ndim"1f \n\t" \ + #ndim"2: \n\t" \ + COPY_2x##ndim \ + #ndim"1: \n\t" \ + "andi $r6, %5, 1 \n\t" \ + "beqz $r6, "#ndim"0f \n\t" \ + COPY_1x##ndim \ + #ndim"0: \n\t" \ + :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp) \ + :"r"(num_cols),"r"(&ALPHA) \ + :"memory", "$r6", "$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7", "$f8", "$f9", "$f10", "$f11", "$f15" \ + ); + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, 
FLOAT *b, BLASLONG ldb){ + float *src, *dst, *dst_tmp=0, *src_base, *dst_base; + uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_cols = 0; + BLASLONG rows_left, cols_done; float ALPHA = alpha; + if (ALPHA == 0.0) { + dst_base = b; + for (rows_left = rows; rows_left > 0; rows_left--) {memset(dst_base, 0, cols * sizeof(float)); dst_base += ldb;} + return 0; + } + for (cols_done = 0; cols_done < cols; cols_done += num_cols) { + num_cols = cols - cols_done; + if (num_cols > ROWS_OF_BLOCK) num_cols = ROWS_OF_BLOCK; + rows_left = rows; src_base = a + (int64_t)lda * (int64_t)cols_done; dst_base = b + cols_done; + for (;rows_left > 15; rows_left -= 16) {COMPUTE(16) src_base += 16; dst_base += 16 * ldb;} + for (;rows_left > 7; rows_left -= 8) {COMPUTE(8) src_base += 8; dst_base += 8 * ldb;} + for (;rows_left > 3; rows_left -= 4) {COMPUTE(4) src_base += 4; dst_base += 4 * ldb;} + for (;rows_left > 1; rows_left -= 2) {COMPUTE(2) src_base += 2; dst_base += 2 * ldb;} + if (rows_left > 0) {COMPUTE(1) src_base ++; dst_base += ldb;} + } +} From ab71a1edf24e309f18013b97c6473b92fbfb9608 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 17 Oct 2024 08:25:02 -0500 Subject: [PATCH 104/244] Better VSX. --- kernel/power/sbgemv_t_vsx.c | 70 +++++++++++++++++++++++++++---------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/kernel/power/sbgemv_t_vsx.c b/kernel/power/sbgemv_t_vsx.c index e72d2f31e0..ecee23a0cf 100644 --- a/kernel/power/sbgemv_t_vsx.c +++ b/kernel/power/sbgemv_t_vsx.c @@ -195,7 +195,7 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL vec_f32 temp6 = { 0, 0, 0, 0 }; vec_f32 temp7 = { 0, 0, 0, 0 }; vec_bf16 zero = { 0, 0, 0, 0, 0, 0, 0, 0 }; - vec_f32 inp[2]; + vec_f32 inp[2], inp0[2], inp1[2], inp2[2], inp3[2], inp4[2], inp5[2], inp6[2], inp7[2]; BLASLONG lda4 = lda << 2; a0 = ap; @@ -220,29 +220,61 @@ static void BF16GEMV_T_VSX_8(BLASLONG n, BLASLONG lda, IFLOAT *ap, IFLOAT *x, FL for (; i < n8; i++) { vec_load_vec2(&v_x[i], inp, zero); - - temp0 += vec_load_mult(&va0[i], inp, zero); - temp1 += vec_load_mult(&va1[i], inp, zero); - temp2 += vec_load_mult(&va2[i], inp, zero); - temp3 += vec_load_mult(&va3[i], inp, zero); - temp4 += vec_load_mult(&va4[i], inp, zero); - temp5 += vec_load_mult(&va5[i], inp, zero); - temp6 += vec_load_mult(&va6[i], inp, zero); - temp7 += vec_load_mult(&va7[i], inp, zero); + vec_load_vec2(&va0[i], inp0, zero); + vec_load_vec2(&va1[i], inp1, zero); + vec_load_vec2(&va2[i], inp2, zero); + vec_load_vec2(&va3[i], inp3, zero); + vec_load_vec2(&va4[i], inp4, zero); + vec_load_vec2(&va5[i], inp5, zero); + vec_load_vec2(&va6[i], inp6, zero); + vec_load_vec2(&va7[i], inp7, zero); + + temp0 += (inp[0] * inp0[0]); + temp1 += (inp[0] * inp1[0]); + temp2 += (inp[0] * inp2[0]); + temp3 += (inp[0] * inp3[0]); + temp4 += (inp[0] * inp4[0]); + temp5 += (inp[0] * inp5[0]); + temp6 += (inp[0] * inp6[0]); + temp7 += (inp[0] * inp7[0]); + temp0 += (inp[1] * inp0[1]); + temp1 += (inp[1] * inp1[1]); + temp2 += (inp[1] * inp2[1]); + temp3 += (inp[1] * inp3[1]); + temp4 += (inp[1] * inp4[1]); + temp5 += (inp[1] * inp5[1]); + temp6 += (inp[1] * inp6[1]); + temp7 += (inp[1] * inp7[1]); } n &= 7; if (n > 4) { vec_loadN_vec2(&v_x[i], inp, n, zero); - - temp0 += vec_loadN_mult(&va0[i], inp, n, zero); - temp1 += vec_loadN_mult(&va1[i], inp, n, zero); - temp2 += vec_loadN_mult(&va2[i], inp, n, zero); - temp3 += vec_loadN_mult(&va3[i], inp, n, zero); - temp4 += vec_loadN_mult(&va4[i], inp, n, 
zero); - temp5 += vec_loadN_mult(&va5[i], inp, n, zero); - temp6 += vec_loadN_mult(&va6[i], inp, n, zero); - temp7 += vec_loadN_mult(&va7[i], inp, n, zero); + vec_loadN_vec2(&va0[i], inp0, n, zero); + vec_loadN_vec2(&va1[i], inp1, n, zero); + vec_loadN_vec2(&va2[i], inp2, n, zero); + vec_loadN_vec2(&va3[i], inp3, n, zero); + vec_loadN_vec2(&va4[i], inp4, n, zero); + vec_loadN_vec2(&va5[i], inp5, n, zero); + vec_loadN_vec2(&va6[i], inp6, n, zero); + vec_loadN_vec2(&va7[i], inp7, n, zero); + + temp0 += (inp[0] * inp0[0]); + temp1 += (inp[0] * inp1[0]); + temp2 += (inp[0] * inp2[0]); + temp3 += (inp[0] * inp3[0]); + temp4 += (inp[0] * inp4[0]); + temp5 += (inp[0] * inp5[0]); + temp6 += (inp[0] * inp6[0]); + temp7 += (inp[0] * inp7[0]); + temp0 += (inp[1] * inp0[1]); + temp1 += (inp[1] * inp1[1]); + temp2 += (inp[1] * inp2[1]); + temp3 += (inp[1] * inp3[1]); + temp4 += (inp[1] * inp4[1]); + temp5 += (inp[1] * inp5[1]); + temp6 += (inp[1] * inp6[1]); + temp7 += (inp[1] * inp7[1]); } else if (n) { inp[0] = vec_loadNHi(&v_x[i], n, zero); From 33078d11e428048a7683a48d8a74b884b1d73eac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Oct 2024 21:07:49 +0200 Subject: [PATCH 105/244] stress importance of TARGET setting in DYNAMIC_ARCH builds --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4bff64b153..a31588be02 100644 --- a/README.md +++ b/README.md @@ -254,8 +254,10 @@ On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl On **LoongArch64**, it comprises LA264 and LA464 as well as generic LoongArch64 support. -The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the -common code in the library, usually you will want to set this to the oldest model you expect to encounter. +The `TARGET` option can - and usually **should** - be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. +Failure to specify this may lead to advanced instructions being used by the compiler, just because the build host happens to support them. This is most likely to happen when aggressive optimization options are in effect, and the resulting library may then crash with an +illegal instruction error on weaker hardware, before it even reaches the BLAS routines specifically included for that cpu. + Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. 
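+For example, a `DYNAMIC_ARCH` build of the x86-64 library that must still run on older
+machines could be configured as follows (an illustrative invocation - substitute the
+oldest `TARGET` you actually expect to encounter):
+```sh
+make DYNAMIC_ARCH=1 TARGET=NEHALEM
+```
+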
### Supported OS From ffaa5765a41e283916049820a8fc9d805c72f04f Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 17 Oct 2024 12:32:54 +0000 Subject: [PATCH 106/244] Bench: Add omatcopy --- benchmark/Makefile | 39 +++++++++++++- benchmark/omatcopy.c | 122 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 benchmark/omatcopy.c diff --git a/benchmark/Makefile b/benchmark/Makefile index b7493950a3..c295b14585 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -103,6 +103,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + somatcopy.goto domatcopy.goto comatcopy.goto zomatcopy.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -276,6 +277,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ + somatcopy.goto domatcopy.goto comatcopy.goto zomatcopy.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -2906,6 +2908,29 @@ dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) dznrm2.atlas : dznrm2.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +################################################################################################### + +############################################ SOMATCOPY ############################################ +somatcopy.goto : somatcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +################################################################################################### + +############################################ DOMATCOPY ############################################ +domatcopy.goto : domatcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +################################################################################################### + +############################################ COMATCOPY ############################################ +comatcopy.goto : comatcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +################################################################################################### + +############################################ ZOMATCOPY ############################################ +zomatcopy.goto : zomatcopy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm ################################################################################################### @@ -3435,6 +3460,18 @@ scnrm2.$(SUFFIX) : nrm2.c dznrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +somatcopy.$(SUFFIX) : omatcopy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +domatcopy.$(SUFFIX) : omatcopy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +comatcopy.$(SUFFIX) : omatcopy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zomatcopy.$(SUFFIX) : omatcopy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + smallscaling: smallscaling.c ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread @@ 
-3442,4 +3479,4 @@ smallscaling: smallscaling.c ../$(LIBNAME) clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling -include $(TOPDIR)/Makefile.tail \ No newline at end of file +include $(TOPDIR)/Makefile.tail diff --git a/benchmark/omatcopy.c b/benchmark/omatcopy.c new file mode 100644 index 0000000000..c3348c9be8 --- /dev/null +++ b/benchmark/omatcopy.c @@ -0,0 +1,122 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "bench.h" + +#undef OMATCOPY + +#ifndef COMPLEX +#ifdef DOUBLE +#define OMATCOPY BLASFUNC(domatcopy) +#else +#define OMATCOPY BLASFUNC(somatcopy) +#endif +#else +#ifdef DOUBLE +#define OMATCOPY BLASFUNC(zomatcopy) +#else +#define OMATCOPY BLASFUNC(comatcopy) +#endif +#endif +int main(int argc, char *argv[]){ + FLOAT *a, *b; + FLOAT alpha[] = {1.0, 0.0}; + char trans = 'N'; + char order = 'C'; + blasint crows, ccols, clda, cldb; + int loops = 1; + char *p; + + int from = 1; + int to = 200; + int step = 1; + int i, j; + + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++; } + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++; } + if (argc > 0) { step = atol(*argv); argc--; argv++; } + + if ((p = getenv("OPENBLAS_TRANS"))) { + trans=*p; + } + if ((p = getenv("OPENBLAS_ORDER"))) { + order=*p; + } + TOUPPER(trans); + TOUPPER(order); + fprintf(stderr, "From : %3d To : %3d Step=%d : Trans=%c : Order=%c\n", from, to, step, trans, order); + p = getenv("OPENBLAS_LOOPS"); + if ( p != NULL ) { + loops = atoi(p); + } + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef __linux + srandom(getpid()); +#endif + + for (i = 0; i < to * to * COMPSIZE; i++) { + a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + for (i = 0; i < to * to * COMPSIZE; i++) { + b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + fprintf(stderr, " SIZE Flops Time\n"); + for (i = from; i <= to; i += step) { + cldb = clda = crows = ccols = i; + fprintf(stderr, " ROWS=%4d, COLS=%4d : ", (int)crows, (int)ccols); + begin(); + + for (j=0; j Date: Fri, 18 Oct 2024 14:14:43 +0200 Subject: [PATCH 107/244] write HAVE_SVE to config where applicable --- cpuid_arm64.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 2cfa96ea60..77d5e286bc 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -401,6 +401,7 @@ void get_cpuconfig(void) break; case CPU_NEOVERSEV1: + printf("#define HAVE_SVE 1\n"); case CPU_CORTEXA76: printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 65536\n"); @@ -432,6 +433,7 @@ void get_cpuconfig(void) break; case CPU_NEOVERSEV2: printf("#define ARMV9\n"); + printf("#define HAVE_SVE 1\n"); printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_LINESIZE 64\n"); @@ -452,6 +454,7 @@ void get_cpuconfig(void) case CPU_CORTEXX1: case CPU_CORTEXX2: printf("#define ARMV9\n"); + printf("#define HAVE_SVE 1\n"); printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_LINESIZE 64\n"); @@ -568,6 +571,7 @@ void get_cpuconfig(void) break; case CPU_A64FX: printf("#define A64FX\n"); + printf("#define HAVE_SVE 1\n"); printf("#define L1_CODE_SIZE 65535\n"); printf("#define L1_DATA_SIZE 65535\n"); printf("#define L1_DATA_LINESIZE 256\n"); From c4bb4e74fc5ce7987d32cb0eb8c9dfedc4ecd7ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Oct 2024 14:50:55 +0200 Subject: [PATCH 108/244] NeoverseN2 has SVE too --- cpuid_arm64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 77d5e286bc..5d25d2ff69 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -430,6 +430,7 @@ void get_cpuconfig(void) printf("#define 
L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); + printf("#define HAVE_SVE 1\n"); break; case CPU_NEOVERSEV2: printf("#define ARMV9\n"); From 4dba6ce6ea549b226ed3a4e481165370abd1b4dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Oct 2024 20:25:06 +0200 Subject: [PATCH 109/244] work around mingw32-gfortran 14.2 miscompiling CBLAS1 tests --- ctest/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index c56a78346f..4496eff825 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -6,6 +6,10 @@ enable_language(Fortran) endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") +if (BINARY32 AND CMAKE_C_PLATFORM_ID MATCHES "MinGW" AND CMAKE_Fortran_COMPILER_VERSION VERSION_EQUAL 14.2) + list(REMOVE_ITEM ${CMAKE_Fortran_FLAGS} -O3 -O2 -O1 -Os) + set (CMAKE_Fortran_FLAGS_RELEASE "" CACHE STRING "" FORCE) +endif() if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") endif() From b6ec73e77c619f9c10d56ea78a50ea639a4490c4 Mon Sep 17 00:00:00 2001 From: Ayappan Perumal Date: Mon, 21 Oct 2024 07:38:03 -0500 Subject: [PATCH 110/244] Fix AIX build --- kernel/power/gemm_common.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c index 88fa52de53..2dd5a8982d 100644 --- a/kernel/power/gemm_common.c +++ b/kernel/power/gemm_common.c @@ -28,7 +28,12 @@ #define USE_VECTOR_PAIRS #endif +#ifdef _AIX +#include +typedef __vector unsigned short vec_bf16; +#else typedef __vector IFLOAT vec_bf16; +#endif typedef __vector FLOAT vec_f32; typedef __vector unsigned char vec_uc8; From 020cce106876abc127705041f8eb2307d8117ef8 Mon Sep 17 00:00:00 2001 From: Ayappan Perumal Date: Wed, 23 Oct 2024 04:24:06 -0500 Subject: [PATCH 111/244] Fix build issues with gcc compiler as well --- kernel/power/gemm_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c index 2dd5a8982d..adfbb85c2a 100644 --- a/kernel/power/gemm_common.c +++ b/kernel/power/gemm_common.c @@ -49,7 +49,7 @@ FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src) #ifdef __clang__ vy0p = __builtin_vsx_lxvp(0L, (const __vector_pair *)(src)); #else - vy0p = *(__vector_pair *)(src); + vy0p = *(__vector_pair *)((void *)src); #endif __builtin_vsx_disassemble_pair((void *)(dst), &vy0p); #else @@ -66,7 +66,7 @@ FORCEINLINE void vec_store_pair(vec_f32 *dst, vec_f32 *src) #ifdef __clang__ __builtin_vsx_stxvp(vy0p, 0L, (__vector_pair *)(dst)); #else - *(__vector_pair *)(dst) = vy0p; + *(__vector_pair *)((void *)dst) = vy0p; #endif #else dst[0] = src[0]; From 79f4bbd4cdadaef0f2d30dc178289686d96422c9 Mon Sep 17 00:00:00 2001 From: nickel Date: Thu, 24 Oct 2024 11:20:02 +0200 Subject: [PATCH 112/244] fix: return types of *trtrs routines --- lapack-netlib/LAPACKE/include/lapack.h | 8 ++++---- lapack-netlib/SRC/cgels.c | 2 +- lapack-netlib/SRC/cgelst.c | 2 +- lapack-netlib/SRC/cgetsls.c | 2 +- lapack-netlib/SRC/cggglm.c | 2 +- lapack-netlib/SRC/cgglse.c | 2 +- lapack-netlib/SRC/dgels.c | 2 +- lapack-netlib/SRC/dgelst.c | 2 +- lapack-netlib/SRC/dgetsls.c | 2 +- lapack-netlib/SRC/dggglm.c | 2 +- lapack-netlib/SRC/sgels.c | 2 +- lapack-netlib/SRC/sgelst.c | 2 +- lapack-netlib/SRC/sgetsls.c | 2 +- lapack-netlib/SRC/sggglm.c | 2 +- lapack-netlib/SRC/sgglse.c | 2 +- lapack-netlib/SRC/zgels.c | 2 +- lapack-netlib/SRC/zgelst.c | 2 +- 
lapack-netlib/SRC/zgetsls.c | 2 +- lapack-netlib/SRC/zggglm.c | 2 +- lapack-netlib/SRC/zgglse.c | 2 +- 20 files changed, 23 insertions(+), 23 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 532aae574e..0ed9ad01a9 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -22379,7 +22379,7 @@ lapack_int LAPACK_ztrtri_base( #endif #define LAPACK_ctrtrs_base LAPACK_GLOBAL(ctrtrs,CTRTRS) -lapack_int LAPACK_ctrtrs_base( +void LAPACK_ctrtrs_base( char const* uplo, char const* trans, char const* diag, lapack_int const* n, lapack_int const* nrhs, lapack_complex_float const* A, lapack_int const* lda, @@ -22396,7 +22396,7 @@ lapack_int LAPACK_ctrtrs_base( #endif #define LAPACK_dtrtrs_base LAPACK_GLOBAL(dtrtrs,DTRTRS) -lapack_int LAPACK_dtrtrs_base( +void LAPACK_dtrtrs_base( char const* uplo, char const* trans, char const* diag, lapack_int const* n, lapack_int const* nrhs, double const* A, lapack_int const* lda, @@ -22413,7 +22413,7 @@ lapack_int LAPACK_dtrtrs_base( #endif #define LAPACK_strtrs_base LAPACK_GLOBAL(strtrs,STRTRS) -lapack_int LAPACK_strtrs_base( +void LAPACK_strtrs_base( char const* uplo, char const* trans, char const* diag, lapack_int const* n, lapack_int const* nrhs, float const* A, lapack_int const* lda, @@ -22430,7 +22430,7 @@ lapack_int LAPACK_strtrs_base( #endif #define LAPACK_ztrtrs_base LAPACK_GLOBAL(ztrtrs,ZTRTRS) -lapack_int LAPACK_ztrtrs_base( +void LAPACK_ztrtrs_base( char const* uplo, char const* trans, char const* diag, lapack_int const* n, lapack_int const* nrhs, lapack_complex_double const* A, lapack_int const* lda, diff --git a/lapack-netlib/SRC/cgels.c b/lapack-netlib/SRC/cgels.c index 1a84f97b37..827210975d 100644 --- a/lapack-netlib/SRC/cgels.c +++ b/lapack-netlib/SRC/cgels.c @@ -739,7 +739,7 @@ static integer c__0 = 0; complex *, complex *, integer *, complex *, integer *, integer *); real smlnum; logical lquery; - extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrtrs_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); diff --git a/lapack-netlib/SRC/cgelst.c b/lapack-netlib/SRC/cgelst.c index 2378d4074f..86c5341a94 100644 --- a/lapack-netlib/SRC/cgelst.c +++ b/lapack-netlib/SRC/cgelst.c @@ -748,7 +748,7 @@ f"> */ integer mnnrhs; real smlnum; logical lquery; - extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrtrs_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); extern void cgemlqt_(char *, char *, integer *, integer *, integer *, integer *, complex *, integer *, complex *, diff --git a/lapack-netlib/SRC/cgetsls.c b/lapack-netlib/SRC/cgetsls.c index 33c7b192ab..40a6322d4c 100644 --- a/lapack-netlib/SRC/cgetsls.c +++ b/lapack-netlib/SRC/cgetsls.c @@ -713,7 +713,7 @@ static integer c__0 = 0; real bignum, smlnum; integer wsizem, wsizeo; logical lquery; - extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrtrs_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); integer lw1, lw2, mnk; real dum[1]; diff --git a/lapack-netlib/SRC/cggglm.c b/lapack-netlib/SRC/cggglm.c index a71d656f0d..b8b69614bf 100644 --- a/lapack-netlib/SRC/cggglm.c +++ b/lapack-netlib/SRC/cggglm.c @@ -728,7 +728,7 @@ f"> */ complex *, complex *, integer *, complex *, integer *, integer *); 
integer lwkopt; logical lquery; - extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrtrs_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); diff --git a/lapack-netlib/SRC/cgglse.c b/lapack-netlib/SRC/cgglse.c index 67871b3180..1a991c0c72 100644 --- a/lapack-netlib/SRC/cgglse.c +++ b/lapack-netlib/SRC/cgglse.c @@ -725,7 +725,7 @@ f"> */ complex *, complex *, integer *, complex *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int ctrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void ctrtrs_(char *, char *, char *, integer *, integer *, complex *, integer *, complex *, integer *, integer *); diff --git a/lapack-netlib/SRC/dgels.c b/lapack-netlib/SRC/dgels.c index 543ad8ec04..1e029992e0 100644 --- a/lapack-netlib/SRC/dgels.c +++ b/lapack-netlib/SRC/dgels.c @@ -739,7 +739,7 @@ static integer c__0 = 0; doublereal *, integer *, integer *); doublereal smlnum; logical lquery; - extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrtrs_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); diff --git a/lapack-netlib/SRC/dgelst.c b/lapack-netlib/SRC/dgelst.c index afaeaf7ccb..4413002ed0 100644 --- a/lapack-netlib/SRC/dgelst.c +++ b/lapack-netlib/SRC/dgelst.c @@ -747,7 +747,7 @@ f"> */ integer mnnrhs; doublereal smlnum; logical lquery; - extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrtrs_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); extern void dgemlqt_(char *, char *, diff --git a/lapack-netlib/SRC/dgetsls.c b/lapack-netlib/SRC/dgetsls.c index 1c4cf686b4..5713ba79b5 100644 --- a/lapack-netlib/SRC/dgetsls.c +++ b/lapack-netlib/SRC/dgetsls.c @@ -713,7 +713,7 @@ static integer c__0 = 0; doublereal bignum, smlnum; integer wsizem, wsizeo; logical lquery; - extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrtrs_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); integer lw1, lw2, mnk, lwm, lwo; diff --git a/lapack-netlib/SRC/dggglm.c b/lapack-netlib/SRC/dggglm.c index 75913c4972..28036f36d1 100644 --- a/lapack-netlib/SRC/dggglm.c +++ b/lapack-netlib/SRC/dggglm.c @@ -730,7 +730,7 @@ f"> */ doublereal *, integer *, integer *); integer lwkopt; logical lquery; - extern /* Subroutine */ int dtrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void dtrtrs_(char *, char *, char *, integer *, integer *, doublereal *, integer *, doublereal *, integer *, integer *); diff --git a/lapack-netlib/SRC/sgels.c b/lapack-netlib/SRC/sgels.c index f0eef99aa0..20ed3cf169 100644 --- a/lapack-netlib/SRC/sgels.c +++ b/lapack-netlib/SRC/sgels.c @@ -485,7 +485,7 @@ static integer c__0 = 0; extern /* Subroutine */ void sormqr_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real *, integer *, real *, integer *, integer *); - extern int strtrs_(char *, char *, + extern void strtrs_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer * , integer *); diff --git a/lapack-netlib/SRC/sgelst.c b/lapack-netlib/SRC/sgelst.c index 6a6ed86bf1..eec93ec91a 100644 --- a/lapack-netlib/SRC/sgelst.c +++ b/lapack-netlib/SRC/sgelst.c @@ -744,7 +744,7 @@ f"> */ *, integer *, real *, integer 
*, real *, integer *); real smlnum; logical lquery; - extern /* Subroutine */ int strtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void strtrs_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, integer *); extern void sgemlqt_(char *, char *, integer *, integer *, integer *, integer *, real *, integer *, real *, diff --git a/lapack-netlib/SRC/sgetsls.c b/lapack-netlib/SRC/sgetsls.c index 09b8d3a593..1156eff449 100644 --- a/lapack-netlib/SRC/sgetsls.c +++ b/lapack-netlib/SRC/sgetsls.c @@ -711,7 +711,7 @@ static integer c__0 = 0; integer wsizem, wsizeo; logical lquery; integer lw1, lw2; - extern /* Subroutine */ int strtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void strtrs_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, integer *); integer mnk, lwm, lwo; diff --git a/lapack-netlib/SRC/sggglm.c b/lapack-netlib/SRC/sggglm.c index 38694a1c55..157482b983 100644 --- a/lapack-netlib/SRC/sggglm.c +++ b/lapack-netlib/SRC/sggglm.c @@ -473,7 +473,7 @@ f"> */ integer *, integer *), sormrq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real * , integer *, real *, integer *, integer *); - extern int strtrs_(char *, char *, char *, integer *, integer *, real *, + extern void strtrs_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, integer *); diff --git a/lapack-netlib/SRC/sgglse.c b/lapack-netlib/SRC/sgglse.c index c731ea1a7b..99393a1b88 100644 --- a/lapack-netlib/SRC/sgglse.c +++ b/lapack-netlib/SRC/sgglse.c @@ -471,7 +471,7 @@ f"> */ integer *, integer *), sormrq_(char *, char *, integer *, integer *, integer *, real *, integer *, real *, real * , integer *, real *, integer *, integer *); - extern int strtrs_(char *, char *, char *, integer *, integer *, real *, + extern void strtrs_(char *, char *, char *, integer *, integer *, real *, integer *, real *, integer *, integer *); diff --git a/lapack-netlib/SRC/zgels.c b/lapack-netlib/SRC/zgels.c index 5f3ca5e4b9..540ee45613 100644 --- a/lapack-netlib/SRC/zgels.c +++ b/lapack-netlib/SRC/zgels.c @@ -738,7 +738,7 @@ static integer c__0 = 0; doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmqr_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); - extern int ztrtrs_(char *, char *, char *, integer *, + extern void ztrtrs_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); diff --git a/lapack-netlib/SRC/zgelst.c b/lapack-netlib/SRC/zgelst.c index 42282011cd..29c2713b18 100644 --- a/lapack-netlib/SRC/zgelst.c +++ b/lapack-netlib/SRC/zgelst.c @@ -752,7 +752,7 @@ f"> */ doublecomplex *, integer *, doublecomplex *, integer *, doublecomplex *, integer *); logical lquery; - extern /* Subroutine */ int ztrtrs_(char *, char *, char *, integer *, + extern /* Subroutine */ void ztrtrs_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); extern void zgemlqt_(char *, char *, diff --git a/lapack-netlib/SRC/zgetsls.c b/lapack-netlib/SRC/zgetsls.c index c9f1d5d971..1309687183 100644 --- a/lapack-netlib/SRC/zgetsls.c +++ b/lapack-netlib/SRC/zgetsls.c @@ -716,7 +716,7 @@ static integer c__0 = 0; integer wsizem, wsizeo; logical lquery; integer lw1, lw2; - extern /* Subroutine */ int ztrtrs_(char *, char *, char *, integer *, + extern /* Subroutine 
*/ void ztrtrs_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); integer mnk; diff --git a/lapack-netlib/SRC/zggglm.c b/lapack-netlib/SRC/zggglm.c index e4ed67fb04..5aaca4dd16 100644 --- a/lapack-netlib/SRC/zggglm.c +++ b/lapack-netlib/SRC/zggglm.c @@ -730,7 +730,7 @@ f"> */ doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmrq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); - extern int ztrtrs_(char *, char *, char *, integer *, + extern void ztrtrs_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); diff --git a/lapack-netlib/SRC/zgglse.c b/lapack-netlib/SRC/zgglse.c index 06a3c266b1..801210cd78 100644 --- a/lapack-netlib/SRC/zgglse.c +++ b/lapack-netlib/SRC/zgglse.c @@ -727,7 +727,7 @@ f"> */ integer *, doublecomplex *, integer *, doublecomplex *, doublecomplex *, integer *, doublecomplex *, integer *, integer *), zunmrq_(char *, char *, integer *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, - doublecomplex *, integer *, doublecomplex *, integer *, integer *); extern int ztrtrs_(char *, char *, char *, integer *, + doublecomplex *, integer *, doublecomplex *, integer *, integer *), ztrtrs_(char *, char *, char *, integer *, integer *, doublecomplex *, integer *, doublecomplex *, integer *, integer *); From cb48505251d0a6cb4fb65787f4838478d8806e92 Mon Sep 17 00:00:00 2001 From: Chris Daley Date: Thu, 24 Oct 2024 21:05:26 -0700 Subject: [PATCH 113/244] optimize gemv forwarding on ARM64 systems --- CONTRIBUTORS.md | 3 +++ interface/gemm.c | 24 ++++++++++++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index d885a01b96..a6d25b50bd 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -226,3 +226,6 @@ In chronological order: * Dirreke * [2024-01-16] Add basic support for the CSKY architecture + +* Christopher Daley + * [2024-01-24] Optimize GEMV forwarding on ARM64 systems diff --git a/interface/gemm.c b/interface/gemm.c index 5742d36c4b..576e94593c 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -39,6 +39,7 @@ #include #include +#include #include "common.h" #ifdef FUNCTION_PROFILE #include "functable.h" @@ -499,6 +500,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && (!defined(BFLOAT16) || defined(GEMM_GEMV_FORWARD_BF16)) +#if defined(ARCH_ARM64) + // The gemv kernels in arm64/{gemv_n.S,gemv_n_sve.c,gemv_t.S,gemv_t_sve.c} + // perform poorly in certain circumstances. We use the following boolean + // variable along with the gemv argument values to avoid these inefficient + // gemv cases, see github issue#4951. 
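+  // (For context: the forwarding below is possible because a GEMM call with
+  //  n == 1, i.e. C[m x 1] = alpha * op(A)[m x k] * B[k x 1] + beta * C, is
+  //  by definition a GEMV on op(A), and symmetrically for the m == 1 case.)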
+ bool have_tuned_gemv = false; +#else + bool have_tuned_gemv = true; +#endif // Check if we can convert GEMM -> GEMV if (args.k != 0) { if (args.n == 1) { @@ -518,8 +528,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS if (transb & 1) { inc_x = args.ldb; } - GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y); - return; + bool is_efficient_gemv = have_tuned_gemv || ((NT == 'N') || (NT == 'T' && inc_x == 1)); + if (is_efficient_gemv) { + GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y); + return; + } } if (args.m == 1) { blasint inc_x = args.lda; @@ -538,8 +551,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS m = args.n; n = args.k; } - GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y); - return; + bool is_efficient_gemv = have_tuned_gemv || ((NT == 'N' && inc_y == 1) || (NT == 'T' && inc_x == 1)); + if (is_efficient_gemv) { + GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y); + return; + } } } #endif From 8f595382c4f31b0f49be8bf178aed80f501a0bc3 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 25 Oct 2024 03:12:15 +0000 Subject: [PATCH 114/244] gh-actions: Test LoongArch64 with gcc14 from Ubuntu 24.04 --- .github/workflows/loongarch64.yml | 129 ++++++++++++------------------ 1 file changed, 53 insertions(+), 76 deletions(-) diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml index 69379e0500..c4f5df13a4 100644 --- a/.github/workflows/loongarch64.yml +++ b/.github/workflows/loongarch64.yml @@ -9,31 +9,31 @@ concurrency: jobs: TEST: if: "github.repository == 'OpenMathLib/OpenBLAS'" - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: include: - target: LOONGSONGENERIC - triple: loongarch64-unknown-linux-gnu + triple: loongarch64-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC - target: LOONGSON3R5 - triple: loongarch64-unknown-linux-gnu + triple: loongarch64-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5 - target: LOONGSON2K1000 - triple: loongarch64-unknown-linux-gnu + triple: loongarch64-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000 - target: LA64_GENERIC - triple: loongarch64-unknown-linux-gnu + triple: loongarch64-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA64_GENERIC - target: LA464 - triple: loongarch64-unknown-linux-gnu + triple: loongarch64-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA464 - target: LA264 - triple: loongarch64-unknown-linux-gnu + triple: loongarch64-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LA264 - target: DYNAMIC_ARCH - triple: loongarch64-unknown-linux-gnu + triple: loongarch64-linux-gnu opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC steps: @@ -42,33 +42,9 @@ jobs: - name: Install APT deps run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build make ccache - - - name: Download and install loongarch64-toolchain - run: | - wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz - #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz - tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt - - - name: Checkout qemu - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: master - - 
- name: Install qemu - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=loongarch64-linux-user --disable-system --static - make -j$(nproc) - make install - - - name: Set env - run: | - echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV - echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV + sudo apt-get update && \ + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache qemu-user-static \ + gcc-14-loongarch64-linux-gnu g++-14-loongarch64-linux-gnu gfortran-14-loongarch64-linux-gnu - name: Compilation cache uses: actions/cache@v3 @@ -89,54 +65,55 @@ jobs: - name: Disable utest dsdot:dsdot_n_1 run: | echo -n > utest/test_dsdot.c - echo "Due to the qemu versions 7.2 causing utest cases to fail," + echo "Due to the current version of qemu causing utest cases to fail," echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." - name: Build OpenBLAS - run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + run: | + make CC='ccache ${{ matrix.triple }}-gcc-14 -static' FC='ccache ${{ matrix.triple }}-gfortran-14 -static' \ + RANLIB='ccache ${{ matrix.triple }}-gcc-ranlib-14' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) - name: Test run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH - qemu-loongarch64 ./utest/openblas_utest - qemu-loongarch64 ./utest/openblas_utest_ext - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat2 < ./ctest/sin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat2 < ./ctest/din2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat2 < ./ctest/cin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat2 < ./ctest/zin2 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xscblat3 < ./ctest/sin3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xdcblat3 < ./ctest/din3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xccblat3 < ./ctest/cin3 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./ctest/xzcblat3 < ./ctest/zin3 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat1 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat1 - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat1 + qemu-loongarch64-static ./utest/openblas_utest + qemu-loongarch64-static ./utest/openblas_utest_ext + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 + 
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat rm -f ./test/?BLAT2.SUMM - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat2 < ./test/sblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat2 < ./test/dblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat2 < ./test/cblat2.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/zblat2 < ./test/zblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64 ./test/zblat3 < ./test/zblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat rm -f ./test/?BLAT3.SUMM - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/sblat3 < ./test/sblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/dblat3 < ./test/dblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 ./test/cblat3 < ./test/cblat3.dat - OPENBLAS_NUM_THREADS=2 qemu-loongarch64 
./test/zblat3 < ./test/zblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat From 815cb24944916ba35092041b79942a4cffce08ea Mon Sep 17 00:00:00 2001 From: TGY Date: Wed, 16 Aug 2023 06:06:00 +0200 Subject: [PATCH 115/244] remove unused INLINE macro definitions --- common_arm.h | 2 -- common_arm64.h | 2 -- common_e2k.h | 2 -- common_loongarch64.h | 2 -- common_mips.h | 2 -- common_mips64.h | 2 -- common_power.h | 4 +--- common_riscv64.h | 2 -- common_zarch.h | 3 --- 9 files changed, 1 insertion(+), 20 deletions(-) diff --git a/common_arm.h b/common_arm.h index a3db9953cb..80aabc7b02 100644 --- a/common_arm.h +++ b/common_arm.h @@ -47,8 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#define INLINE inline - #define RETURN_BY_COMPLEX #ifndef ASSEMBLER diff --git a/common_arm64.h b/common_arm64.h index 876a4aa6de..3e72e2a324 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -44,8 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") #endif -#define INLINE inline - #if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) #define RETURN_BY_STACK #else diff --git a/common_e2k.h b/common_e2k.h index 0739c94732..fe8370c6c2 100644 --- a/common_e2k.h +++ b/common_e2k.h @@ -41,8 +41,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) #define RMB -#define INLINE __attribute__((__always_inline__)) inline - static inline int blas_quickdivide(blasint x, blasint y) { return x / y; } diff --git a/common_loongarch64.h b/common_loongarch64.h index 2b48450a2d..6e96984f65 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define WMB __sync_synchronize() #define RMB __sync_synchronize() -#define INLINE inline - #ifndef ASSEMBLER static inline int blas_quickdivide(blasint x, blasint y){ diff --git a/common_mips.h b/common_mips.h index 7dc3ba246f..ce328d7e2d 100644 --- a/common_mips.h +++ b/common_mips.h @@ -37,8 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define WMB __sync_synchronize() #define RMB __sync_synchronize() -#define INLINE inline - #define RETURN_BY_COMPLEX #ifndef ASSEMBLER diff --git a/common_mips64.h b/common_mips64.h index 006cf33e41..c7eb212df6 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define WMB __sync_synchronize() #define RMB __sync_synchronize() -#define INLINE inline - #ifndef ASSEMBLER static inline unsigned int rpcc(void){ diff --git a/common_power.h b/common_power.h index 6b13f06b10..ded76ad519 100644 --- a/common_power.h +++ b/common_power.h @@ -78,8 +78,6 @@ #define RMB __asm__ __volatile__ ("sync") #endif -#define INLINE inline - #ifdef PPC440 #define STDERR stdout #define QNONCACHE 0x1 @@ -91,7 +89,7 @@ void *qalloc(int flags, size_t bytes); -static INLINE void blas_lock(volatile unsigned long *address){ +static inline void blas_lock(volatile unsigned long *address){ long int ret, val = 1; diff --git a/common_riscv64.h b/common_riscv64.h index eccfc644fe..ba638e8be5 100644 --- a/common_riscv64.h +++ b/common_riscv64.h @@ -75,8 +75,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define WMB __sync_synchronize() #define RMB __sync_synchronize() -#define INLINE inline - #ifndef ASSEMBLER diff --git a/common_zarch.h b/common_zarch.h index 80609251b7..7911f11ae7 100644 --- a/common_zarch.h +++ b/common_zarch.h @@ -37,9 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define WMB #define RMB - -#define INLINE inline - #define RETURN_BY_COMPLEX #ifndef ASSEMBLER From 0e6a2cc93cfb68b793e662b03384777ce445c963 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 25 Oct 2024 16:47:52 +0200 Subject: [PATCH 116/244] bump the minimum_required version instead --- CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a4e025503a..4d193f4d81 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,9 +2,7 @@ ## Author: Hank Anderson ## -cmake_minimum_required(VERSION 2.8.5) - -cmake_policy(SET CMP0042 NEW) +cmake_minimum_required(VERSION 3.16.0) project(OpenBLAS C ASM) From 73c6a28073f9f80f22ea210654f8bcc0d47cbce8 Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 29 Oct 2024 06:31:58 +0000 Subject: [PATCH 117/244] x86_64: opt somatcopy_ct with AVX --- kernel/x86_64/KERNEL | 1 + kernel/x86_64/omatcopy_ct.c | 373 ++++++++++++++++++++++++++++++++++++ 2 files changed, 374 insertions(+) create mode 100644 kernel/x86_64/omatcopy_ct.c diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index ec4290e823..2deb5a864c 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -493,3 +493,4 @@ CSUMKERNEL = zsum_sse.S ZSUMKERNEL = zsum_sse2.S SOMATCOPY_RT = omatcopy_rt.c +SOMATCOPY_CT = omatcopy_ct.c diff --git a/kernel/x86_64/omatcopy_ct.c b/kernel/x86_64/omatcopy_ct.c new file mode 100644 index 0000000000..ffb41db8b4 --- /dev/null +++ b/kernel/x86_64/omatcopy_ct.c @@ -0,0 +1,373 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#ifdef HAVE_AVX + +#define COLS_OF_BLOCK 384 + +/* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */ +/* m: %5 = num_rows, %6 = alpha */ +/* xmm15 = alpha */ +#define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\ + "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\ + "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\ + "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";" + +#define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\ + "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\ + "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\ + "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\ + "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";" + +#define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vmovups %%xmm"#b3_no",(%4); vmovups %%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\ + "vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\ + "vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;" + +#define COPY_4x16 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\ + TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7) + +#define COPY_4x8 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\ + TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3) + +#define COPY_4x4 "movq %1,%4; addq $16,%1;"\ + "vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\ + "vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\ + TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3) + 
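+/* A rough scalar picture of the tile logic: each COPY_4xN macro above copies
+   a 4-row by N-column strip of the source into an N-row by 4-column strip of
+   the destination. Four source rows are loaded, scaled by alpha (broadcast
+   into xmm15/ymm15 by COMPUTE below), transposed in registers by the
+   TRANS_4x4/TRANS_4x8 unpack sequences, and written out through
+   SAVE_4x4/SAVE_4x8, four elements per destination row. Sketch, with src_ld
+   and dst_ld the leading dimensions in elements (names chosen here for
+   illustration only):
+
+       for (int r = 0; r < 4; r++)
+           for (int c = 0; c < N; c++)
+               dst[(size_t)c * dst_ld + r] = alpha * src[(size_t)r * src_ld + c];
+*/
+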
+#define COPY_4x2 \
+ "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
+ "vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\
+ "vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\
+ "vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;"
+
+#define COPY_4x1 \
+ "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
+ "vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
+ "vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq $16,%1;"
+
+#define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \
+ "vunpcklps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\
+ "vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
+ "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\
+ "vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;"
+
+#define COPY_2x16 "movq %1,%4; addq $8,%1;"\
+ "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\
+ "vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\
+ SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9)
+
+#define COPY_2x8 "movq %1,%4; addq $8,%1;"\
+ "vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\
+ "vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\
+ SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5)
+
+#define COPY_2x4 "movq %1,%4; addq $8,%1;"\
+ "vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\
+ SAVE_2x4(0,1,4,5)
+
+#define COPY_2x2 \
+ "vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\
+ "vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;"
+
+#define COPY_2x1 \
+ "vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;"
+
+#define SAVE_1x4(c1_no)\
+ "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
+ "vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"
+
+#define COPY_1x16 "movq %1,%4; addq $4,%1;"\
+ "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\
+ "vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;"
+
+#define COPY_1x8 "movq %1,%4; addq $4,%1;"\
+ "vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;"
+
+#define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;"
+
+#define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;"
+
+#define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;"
+
+#define COMPUTE(ndim){\
+ src = src_base; dst = dst_base;\
+ __asm__ __volatile__(\
+ "vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\
+ #ndim"31:\n\t"\
+ COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\
+ #ndim"32:\n\t"\
+ "cmpq $2,%%r11; jb "#ndim"33f;"\
+ COPY_2x##ndim "subq $2,%%r11;"\
+ #ndim"33:\n\t"\
+ "testq %%r11,%%r11; jz "#ndim"34f;"\
+ COPY_1x##ndim "subq $1,%%r11;"\
+ #ndim"34:\n\t"\
+ :"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_cols),"m"(ALPHA):"r11","cc","memory"\
+ ,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
+}
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){
+  float *src, *dst, *dst_tmp=0, *src_base, *dst_base;
+  uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_cols = 0;
+  BLASLONG rows_left, cols_done; float ALPHA = alpha;
+  if(ALPHA==0.0){
+    dst_base = b;
+    for(rows_left=rows;rows_left>0;rows_left--) {memset(dst_base,0,cols*sizeof(float)); dst_base += ldb;}
+    return 0;
+  }
+  for(cols_done=0;cols_done<cols;cols_done+=num_cols){
+    num_cols = cols-cols_done; if(num_cols > COLS_OF_BLOCK) num_cols = COLS_OF_BLOCK;
+    rows_left = rows; src_base = a + (int64_t)lda * (int64_t)cols_done; dst_base = b + cols_done;
+    if(ldb%1024>3 && ldb%1024<1021) for(;rows_left>15;rows_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;}
+    for(;rows_left>7;rows_left-=8){COMPUTE(8) src_base += 8; dst_base += 8 * ldb;}
+    for(;rows_left>3;rows_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;}
+    for(;rows_left>1;rows_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;}
+    if(rows_left>0){COMPUTE(1) src_base ++; dst_base += ldb;}
+  }
+  return 0;
+}
+
+#else
+
+int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
+{
+  BLASLONG i, j;
+  FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
+  FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;
+
+  if (rows <= 0) return 0;
+  if (cols <= 0) return 0;
+
+  a_offset = a;
+  b_offset = b;
+
+  i = (cols >> 2);
+  if (i > 0) {
+    do {
+      a_offset1 = a_offset;
+      a_offset2 = a_offset1 + lda;
+      a_offset3 = a_offset2 + lda;
+      a_offset4 = a_offset3 + lda;
+      a_offset += 4 * lda;
+
+      b_offset1 = b_offset;
+      b_offset2 = b_offset1 + ldb;
+      b_offset3 = b_offset2 + ldb;
+      b_offset4 = b_offset3 + ldb;
+      b_offset += 4;
+
+      j = (rows >> 2);
+      if (j > 0) {
+        do {
+          /* Column 1 of MAT_B */
+          *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A
+          *(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
+          *(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
+          *(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
+
+          /* Column 2 of MAT_B */
+          *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A
+          *(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
+          *(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
+          *(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
+
+          /* Column 3 of MAT_B */
+          *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A
+          *(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
+          *(b_offset3 + 2) = *(a_offset3 + 2)*alpha;
+          *(b_offset4 + 2) = *(a_offset3 + 3)*alpha;
+
+          /* Column 4 of MAT_B */
+          *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A
+          *(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
+          *(b_offset3 + 3) = *(a_offset4 + 2)*alpha;
+          *(b_offset4 + 3) = *(a_offset4 + 3)*alpha;
+
+          a_offset1 += 4;
+          a_offset2 += 4;
+          a_offset3 += 4;
+          a_offset4 += 4;
+          b_offset1 += ldb * 4;
+          b_offset2 += ldb * 4;
+          b_offset3 += ldb * 4;
+          b_offset4 += ldb * 4;
+
+          j--;
+        } while (j > 0);
+      } // if(j > 0)
+
+
+      if (rows & 2) {
+        *(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
+        *(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
+
+        *(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
+        *(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
+
+        *(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
+        *(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
+
+        *(b_offset1 + 3)
= *(a_offset4 + 0)*alpha; + *(b_offset2 + 3) = *(a_offset4 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + a_offset3 += 2; + a_offset4 += 2; + + b_offset1 += ldb*2; + + } + + if (rows & 1) { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + + *(b_offset1 + 2) = *(a_offset3 + 0)*alpha; + + *(b_offset1 + 3) = *(a_offset4 + 0)*alpha; + } + + i--; + } while (i > 0); + } + + + if (cols & 2) { + a_offset1 = a_offset; + a_offset2 = a_offset1 + lda; + a_offset += 2 * lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + b_offset += 2; + + j = (rows >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + *(b_offset3 + 1) = *(a_offset2 + 2)*alpha; + *(b_offset4 + 1) = *(a_offset2 + 3)*alpha; + + a_offset1 += 4; + a_offset2 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + + if (rows & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + *(b_offset2 + 1) = *(a_offset2 + 1)*alpha; + + a_offset1 += 2; + a_offset2 += 2; + b_offset1 += ldb*2; + + } + + + if (rows & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset1 + 1) = *(a_offset2 + 0)*alpha; + } + } // if (cols & 2) + + + if (cols & 1) { + a_offset1 = a_offset; + a_offset += lda; + + b_offset1 = b_offset; + b_offset2 = b_offset1 + ldb; + b_offset3 = b_offset2 + ldb; + b_offset4 = b_offset3 + ldb; + + j = (rows >> 2); + if (j > 0){ + do { + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + *(b_offset3 + 0) = *(a_offset1 + 2)*alpha; + *(b_offset4 + 0) = *(a_offset1 + 3)*alpha; + + a_offset1 += 4; + b_offset1 += ldb * 4; + b_offset2 += ldb * 4; + b_offset3 += ldb * 4; + b_offset4 += ldb * 4; + + j--; + } while (j > 0); + } + + if (rows & 2){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + *(b_offset2 + 0) = *(a_offset1 + 1)*alpha; + + a_offset1 += 2; + b_offset1 += ldb * 2; + } + + if (rows & 1){ + *(b_offset1 + 0) = *(a_offset1 + 0)*alpha; + } + } + + return 0; +} + +#endif From 2718b37fedc81d13f3917b5644bfa32081434e9b Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Wed, 30 Oct 2024 13:57:13 +0530 Subject: [PATCH 118/244] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a6d25b50bd..cf74524c8d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -229,3 +229,13 @@ In chronological order: * Christopher Daley * [2024-01-24] Optimize GEMV forwarding on ARM64 systems + +* Aniket P. 
Garade + * [2024-10-30] Optimized scal Level-1 BLAS routines with ARM SVE + +* Sushil Pratap Singh + * [2024-10-30] Optimized swap Level-1 BLAS routines with ARM SVE + +* Juliya James + * [2024-10-30] Optimized rot Level-1 BLAS routines with ARM SVE + From 0667cf6c92396e3813c0dc58460312ff70df6c71 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:01:09 +0530 Subject: [PATCH 119/244] Added optimized scal routine files --- kernel/arm64/scal.c | 40 +++++++++++++++++++++++++ kernel/arm64/scal_kernel_c.c | 43 +++++++++++++++++++++++++++ kernel/arm64/scal_kernel_sve.c | 54 ++++++++++++++++++++++++++++++++++ 3 files changed, 137 insertions(+) create mode 100644 kernel/arm64/scal.c create mode 100644 kernel/arm64/scal_kernel_c.c create mode 100644 kernel/arm64/scal_kernel_sve.c diff --git a/kernel/arm64/scal.c b/kernel/arm64/scal.c new file mode 100644 index 0000000000..e64b0075e8 --- /dev/null +++ b/kernel/arm64/scal.c @@ -0,0 +1,40 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include "scal_kernel_sve.c" +#include "scal_kernel_c.c" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if ((n <= 0) || (inc_x <= 0)) + return (0); + if (inc_x == 1) + scal_kernel_sve(n, x, da); + else + scal_kernel_c(n, da, x, inc_x, y, inc_y); + return (0); +} diff --git a/kernel/arm64/scal_kernel_c.c b/kernel/arm64/scal_kernel_c.c new file mode 100644 index 0000000000..659168da54 --- /dev/null +++ b/kernel/arm64/scal_kernel_c.c @@ -0,0 +1,43 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" + +static int scal_kernel_c(BLASLONG n, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i = 0, j = 0; + + while (j < n) + { + if (da == 0.0) + x[i] = 0.0; + else + x[i] = da * x[i]; + i += inc_x; + j++; + } + return (0); +} diff --git a/kernel/arm64/scal_kernel_sve.c b/kernel/arm64/scal_kernel_sve.c new file mode 100644 index 0000000000..ccd5a4cd2b --- /dev/null +++ b/kernel/arm64/scal_kernel_sve.c @@ -0,0 +1,54 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#include "common.h"
+#include <arm_sve.h>
+
+#ifdef DOUBLE
+#define SVE_TYPE svfloat64_t
+#define SVE_ZERO svdup_f64(0.0)
+#define SVE_WHILELT svwhilelt_b64
+#define SVE_ALL svptrue_b64()
+#define SVE_WIDTH svcntd()
+#else
+#define SVE_TYPE svfloat32_t
+#define SVE_ZERO svdup_f32(0.0)
+#define SVE_WHILELT svwhilelt_b32
+#define SVE_ALL svptrue_b32()
+#define SVE_WIDTH svcntw()
+#endif
+
+static int scal_kernel_sve(int n, FLOAT *x, FLOAT da)
+{
+    for (int i = 0; i < n; i += SVE_WIDTH)
+    {
+        svbool_t pg = SVE_WHILELT(i, n);
+        SVE_TYPE x_vec = svld1(pg, &x[i]);
+        SVE_TYPE result = svmul_z(pg, x_vec, da);
+        svst1(pg, &x[i], result);
+    }
+    return (0);
+}

From b8bc2a752eb66ff696a5e6ebc951d615cf61b854 Mon Sep 17 00:00:00 2001
From: SushilPratap04
Date: Wed, 30 Oct 2024 14:02:57 +0530
Subject: [PATCH 120/244] Added sve optimized kernels for swap routine

---
 kernel/arm64/swap.c | 40 ++++++++++++++++++++++
 kernel/arm64/swap_kernel_c.c | 46 +++++++++++++++++++++++++
 kernel/arm64/swap_kernel_sve.c | 62 ++++++++++++++++++++++++++++++++++
 3 files changed, 148 insertions(+)
 create mode 100644 kernel/arm64/swap.c
 create mode 100644 kernel/arm64/swap_kernel_c.c
 create mode 100644 kernel/arm64/swap_kernel_sve.c

diff --git a/kernel/arm64/swap.c b/kernel/arm64/swap.c
new file mode 100644
index 0000000000..c5af18e6ba
--- /dev/null
+++ b/kernel/arm64/swap.c
@@ -0,0 +1,40 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ +#include "common.h" +#include "swap_kernel_sve.c" +#include "swap_kernel_c.c" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if (n <= 0) + return 0; + if (inc_x == 1 && inc_y == 1) + swap_kernel_sve(n, x, y); + else + swap_kernel_c(n, x, inc_x, y, inc_y); + return (0); +} diff --git a/kernel/arm64/swap_kernel_c.c b/kernel/arm64/swap_kernel_c.c new file mode 100644 index 0000000000..c1d7cc619a --- /dev/null +++ b/kernel/arm64/swap_kernel_c.c @@ -0,0 +1,46 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include "common.h" +#include + +static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + + while (i < n) + { + temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} diff --git a/kernel/arm64/swap_kernel_sve.c b/kernel/arm64/swap_kernel_sve.c new file mode 100644 index 0000000000..fed7e6d0f5 --- /dev/null +++ b/kernel/arm64/swap_kernel_sve.c @@ -0,0 +1,62 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#include "common.h"
+#include <arm_sve.h>
+
+#ifdef DOUBLE
+#define SVE_TYPE svfloat64_t
+#define SVE_ZERO svdup_f64(0.0)
+#define SVE_WHILELT svwhilelt_b64
+#define SVE_ALL svptrue_b64()
+#define SVE_WIDTH svcntd()
+#else
+#define SVE_TYPE svfloat32_t
+#define SVE_ZERO svdup_f32(0.0)
+#define SVE_WHILELT svwhilelt_b32
+#define SVE_ALL svptrue_b32()
+#define SVE_WIDTH svcntw()
+#endif
+
+static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+    BLASLONG sve_width = SVE_WIDTH;
+
+    for (BLASLONG i = 0; i < n; i += sve_width * 2)
+    {
+        svbool_t pg_a = SVE_WHILELT(i, n);
+        svbool_t pg_b = SVE_WHILELT((i + sve_width), n);
+        SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
+        SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
+        SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
+        SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
+        svst1(pg_a, &x[i], y_vec_a);
+        svst1(pg_a, &y[i], x_vec_a);
+        svst1(pg_b, &x[i + sve_width], y_vec_b);
+        svst1(pg_b, &y[i + sve_width], x_vec_b);
+    }
+    return (0);
+}

From 7822ae961784234f21d95b3de3aff53dfb0f799a Mon Sep 17 00:00:00 2001
From: SushilPratap04
Date: Wed, 30 Oct 2024 14:05:21 +0530
Subject: [PATCH 121/244] Added sve kernels for rot routine.

---
 kernel/arm64/rot.c | 40 ++++++++++++++++++++++
 kernel/arm64/rot_kernel_c.c | 44 ++++++++++++++++++++++++
 kernel/arm64/rot_kernel_sve.c | 59 +++++++++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 kernel/arm64/rot.c
 create mode 100644 kernel/arm64/rot_kernel_c.c
 create mode 100644 kernel/arm64/rot_kernel_sve.c

diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c
new file mode 100644
index 0000000000..abddc15381
--- /dev/null
+++ b/kernel/arm64/rot.c
@@ -0,0 +1,40 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include "rot_kernel_sve.c" +#include "rot_kernel_c.c" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + if (n <= 0) + return (0); + if (inc_x == 1 && inc_y == 1) + rot_kernel_sve(n, x, y, c, s); + else + rot_kernel_c(n, x, inc_x, y, inc_y, c, s); + return (0); +} diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c new file mode 100644 index 0000000000..f37d2db169 --- /dev/null +++ b/kernel/arm64/rot_kernel_c.c @@ -0,0 +1,44 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/
+#include "common.h"
+
+static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    BLASLONG i = 0;
+    BLASLONG ix = 0, iy = 0;
+    FLOAT temp;
+    while (i < n)
+    {
+        temp = c * x[ix] + s * y[iy];
+        y[iy] = c * y[iy] - s * x[ix];
+        x[ix] = temp;
+        ix += inc_x;
+        iy += inc_y;
+        i++;
+    }
+    return (0);
+}

diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c
new file mode 100644
index 0000000000..0a790824f0
--- /dev/null
+++ b/kernel/arm64/rot_kernel_sve.c
@@ -0,0 +1,59 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#include "common.h"
+#include <arm_sve.h>
+
+#ifdef DOUBLE
+#define SVE_TYPE svfloat64_t
+#define SVE_ZERO svdup_f64(0.0)
+#define SVE_WHILELT svwhilelt_b64
+#define SVE_ALL svptrue_b64()
+#define SVE_WIDTH svcntd()
+#else
+#define SVE_TYPE svfloat32_t
+#define SVE_ZERO svdup_f32(0.0)
+#define SVE_WHILELT svwhilelt_b32
+#define SVE_ALL svptrue_b32()
+#define SVE_WIDTH svcntw()
+#endif
+
+static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    for (int i = 0; i < n; i += SVE_WIDTH)
+    {
+        svbool_t pg = SVE_WHILELT((uint32_t)i, (uint32_t)n);
+        SVE_TYPE x_vec = svld1(pg, &x[i]);
+        SVE_TYPE y_vec = svld1(pg, &y[i]);
+        SVE_TYPE cx_vec = svmul_z(pg, x_vec, c);
+        SVE_TYPE sy_vec = svmul_z(pg, y_vec, s);
+        SVE_TYPE sx_vec = svmul_z(pg, x_vec, s);
+        SVE_TYPE cy_vec = svmul_z(pg, y_vec, c);
+        svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec));
+        svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec));
+    }
+    return (0);
+}

From fa880ab1cfed1b449a4cbfbd9e55a0d6c78d2e9e Mon Sep 17 00:00:00 2001
From: SushilPratap04
Date: Wed, 30 Oct 2024 14:09:37 +0530
Subject: [PATCH 122/244] Update KERNEL.ARMV8SVE

updated KERNEL.ARMV8SVE for level 1 sve (swap, rot and scal) kernels.
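A note on the structure shared by the three routines: each C wrapper takes
the SVE path only for unit strides and otherwise falls back to the portable
scalar kernel, and the SVE loops handle the remainder by predication rather
than with a separate scalar tail loop. SVE_WHILELT(i, n) yields a predicate
with only lanes i..n-1 active, so the loads, multiplies and stores of the
final partial vector are masked automatically. In effect, one predicated rot
iteration over vl = SVE_WIDTH lanes computes (sketch; lane/vl/xv/yv are
illustrative names only):

    for (int lane = 0; lane < vl && i + lane < n; lane++) {
        FLOAT xv = x[i + lane], yv = y[i + lane];
        x[i + lane] = c * xv + s * yv;  /* svadd_z(pg, cx_vec, sy_vec) */
        y[i + lane] = c * yv - s * xv;  /* svsub_z(pg, cy_vec, sx_vec) */
    }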
--- kernel/arm64/KERNEL.ARMV8SVE | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index bfadf5cba9..cecc72cf96 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -64,13 +64,13 @@ DAXPYKERNEL = daxpy_thunderx2t99.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S -SROTKERNEL = rot.S -DROTKERNEL = rot.S +SROTKERNEL = rot.c +DROTKERNEL = rot.c CROTKERNEL = zrot.S ZROTKERNEL = zrot.S -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S +SSCALKERNEL = scal.c +DSCALKERNEL = scal.c CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S @@ -94,8 +94,8 @@ DCOPYKERNEL = copy_thunderx2t99.c CCOPYKERNEL = copy_thunderx2t99.c ZCOPYKERNEL = copy_thunderx2t99.c -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S +SSWAPKERNEL = swap.c +DSWAPKERNEL = swap.c CSWAPKERNEL = swap_thunderx2t99.S ZSWAPKERNEL = swap_thunderx2t99.S From 668e28adc445edc5d905713daef246733bc62444 Mon Sep 17 00:00:00 2001 From: Juliya32 <116022942+Juliya32@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:22:31 +0530 Subject: [PATCH 123/244] Delete kernel/arm64/rot.c --- kernel/arm64/rot.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 kernel/arm64/rot.c diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c deleted file mode 100644 index abddc15381..0000000000 --- a/kernel/arm64/rot.c +++ /dev/null @@ -1,40 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" -#include "rot_kernel_sve.c" -#include "rot_kernel_c.c" - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - if (n <= 0) - return (0); - if (inc_x == 1 && inc_y == 1) - rot_kernel_sve(n, x, y, c, s); - else - rot_kernel_c(n, x, inc_x, y, inc_y, c, s); - return (0); -} From d90ee00f8595ef46c31cb30fa045a75e8ba0056b Mon Sep 17 00:00:00 2001 From: Juliya32 <116022942+Juliya32@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:22:51 +0530 Subject: [PATCH 124/244] Delete kernel/arm64/rot_kernel_c.c --- kernel/arm64/rot_kernel_c.c | 44 ------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 kernel/arm64/rot_kernel_c.c diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c deleted file mode 100644 index f37d2db169..0000000000 --- a/kernel/arm64/rot_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" - -static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp; - while (i < n) - { - temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - ix += inc_x; - iy += inc_y; - i++; - } - return (0); -} From 012fe4da36a31586965b1e25d70c62f7ad8ac713 Mon Sep 17 00:00:00 2001 From: Juliya32 <116022942+Juliya32@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:23:15 +0530 Subject: [PATCH 125/244] Delete kernel/arm64/rot_kernel_sve.c --- kernel/arm64/rot_kernel_sve.c | 59 ----------------------------------- 1 file changed, 59 deletions(-) delete mode 100644 kernel/arm64/rot_kernel_sve.c diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c deleted file mode 100644 index 0a790824f0..0000000000 --- a/kernel/arm64/rot_kernel_sve.c +++ /dev/null @@ -1,59 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/
-#include "common.h"
-#include <arm_sve.h>
-
-#ifdef DOUBLE
-#define SVE_TYPE svfloat64_t
-#define SVE_ZERO svdup_f64(0.0)
-#define SVE_WHILELT svwhilelt_b64
-#define SVE_ALL svptrue_b64()
-#define SVE_WIDTH svcntd()
-#else
-#define SVE_TYPE svfloat32_t
-#define SVE_ZERO svdup_f32(0.0)
-#define SVE_WHILELT svwhilelt_b32
-#define SVE_ALL svptrue_b32()
-#define SVE_WIDTH svcntw()
-#endif
-
-static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
-{
-    for (int i = 0; i < n; i += SVE_WIDTH)
-    {
-        svbool_t pg = SVE_WHILELT((uint32_t)i, (uint32_t)n);
-        SVE_TYPE x_vec = svld1(pg, &x[i]);
-        SVE_TYPE y_vec = svld1(pg, &y[i]);
-        SVE_TYPE cx_vec = svmul_z(pg, x_vec, c);
-        SVE_TYPE sy_vec = svmul_z(pg, y_vec, s);
-        SVE_TYPE sx_vec = svmul_z(pg, x_vec, s);
-        SVE_TYPE cy_vec = svmul_z(pg, y_vec, c);
-        svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec));
-        svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec));
-    }
-    return (0);
-}

From 3b2421cba0c73db40ba796e9d4f79161cba0b2d9 Mon Sep 17 00:00:00 2001
From: Juliya32 <116022942+Juliya32@users.noreply.github.com>
Date: Wed, 30 Oct 2024 14:23:42 +0530
Subject: [PATCH 126/244] Add files via upload

---
 kernel/arm64/rot.c | 40 ++++++++++++++++++++++
 kernel/arm64/rot_kernel_c.c | 44 ++++++++++++++++++++++++
 kernel/arm64/rot_kernel_sve.c | 59 +++++++++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 kernel/arm64/rot.c
 create mode 100644 kernel/arm64/rot_kernel_c.c
 create mode 100644 kernel/arm64/rot_kernel_sve.c

diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c
new file mode 100644
index 0000000000..abddc15381
--- /dev/null
+++ b/kernel/arm64/rot.c
@@ -0,0 +1,40 @@
+/*******************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/ +#include "common.h" +#include "rot_kernel_sve.c" +#include "rot_kernel_c.c" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + if (n <= 0) + return (0); + if (inc_x == 1 && inc_y == 1) + rot_kernel_sve(n, x, y, c, s); + else + rot_kernel_c(n, x, inc_x, y, inc_y, c, s); + return (0); +} diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c new file mode 100644 index 0000000000..f37d2db169 --- /dev/null +++ b/kernel/arm64/rot_kernel_c.c @@ -0,0 +1,44 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" + +static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + while (i < n) + { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c new file mode 100644 index 0000000000..0a790824f0 --- /dev/null +++ b/kernel/arm64/rot_kernel_sve.c @@ -0,0 +1,59 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#include "common.h"
+#include <arm_sve.h>
+
+#ifdef DOUBLE
+#define SVE_TYPE svfloat64_t
+#define SVE_ZERO svdup_f64(0.0)
+#define SVE_WHILELT svwhilelt_b64
+#define SVE_ALL svptrue_b64()
+#define SVE_WIDTH svcntd()
+#else
+#define SVE_TYPE svfloat32_t
+#define SVE_ZERO svdup_f32(0.0)
+#define SVE_WHILELT svwhilelt_b32
+#define SVE_ALL svptrue_b32()
+#define SVE_WIDTH svcntw()
+#endif
+
+static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    for (int i = 0; i < n; i += SVE_WIDTH)
+    {
+        svbool_t pg = SVE_WHILELT((uint32_t)i, (uint32_t)n);
+        SVE_TYPE x_vec = svld1(pg, &x[i]);
+        SVE_TYPE y_vec = svld1(pg, &y[i]);
+        SVE_TYPE cx_vec = svmul_z(pg, x_vec, c);
+        SVE_TYPE sy_vec = svmul_z(pg, y_vec, s);
+        SVE_TYPE sx_vec = svmul_z(pg, x_vec, s);
+        SVE_TYPE cy_vec = svmul_z(pg, y_vec, c);
+        svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec));
+        svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec));
+    }
+    return (0);
+}

From 0cf656fd3eb431a83c8c71008ad10036a75a6895 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Wed, 30 Oct 2024 12:55:14 +0100
Subject: [PATCH 127/244] Add copies of GEMMT under its new name GEMMTR

---
 interface/CMakeLists.txt | 4 ++++
 interface/Makefile | 50 +++++++++++++++++++++++++++++++---------
 2 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt
index 449072bae1..8d815c4521 100644
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -107,6 +107,9 @@ endif ()

   # trmm is trsm with a compiler flag set
   GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG})
+
+  # gemmtr is gemmt under the name adopted by the Reference BLAS
+  GenerateNamedObjects("gemm.c" "" "gemmtr" ${CBLAS_FLAG})

   # max and imax are compiled 4 times
   GenerateNamedObjects("max.c" "" "" ${CBLAS_FLAG})
@@ -123,6 +126,7 @@ if (BUILD_BFLOAT16)
   GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
   GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
   GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+  GenerateNamedObjects("gemmt.c" "" "sbgemmtr" ${CBLAS_FLAG} "" "" true "BFLOAT16")
   GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
   GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
   GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")

diff --git a/interface/Makefile b/interface/Makefile
index 97439d87f8..c22e087c02 100644
---
a/interface/Makefile +++ b/interface/Makefile @@ -44,12 +44,12 @@ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ - sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) + sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) sgemmtr.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) SBBLAS2OBJS = sbgemv.$(SUFFIX) -SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) sbgemmtr.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -76,7 +76,7 @@ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ - dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) + dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) dgemmtr.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ @@ -105,7 +105,7 @@ CBLAS3OBJS = \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ - cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) + cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) cgemmtr.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -134,7 +134,7 @@ ZBLAS3OBJS = \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ - zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) + zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) zgemmtr.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -282,12 +282,12 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ - cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX) + cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) cblas_sgemmtr.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) -CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemm_batch.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemmtr.$(SUFFIX) cblas_sbgemm_batch.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -308,7 +308,7 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ - cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) cblas_dgemm_batch.$(SUFFIX) + cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) cblas_dgemmtr.$(SUFFIX) cblas_dgemm_batch.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -333,7 +333,7 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) cblas_cgemm_batch.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) cblas_cgemmtr.$(SUFFIX) cblas_cgemm_batch.$(SUFFIX) CXERBLAOBJ = \ cblas_xerbla.$(SUFFIX) @@ -364,7 +364,7 @@ CZBLAS3OBJS = \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zomatcopy.$(SUFFIX) 
cblas_zimatcopy.$(SUFFIX) \ - cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) cblas_zgemm_batch.$(SUFFIX) + cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) cblas_zgemmtr.$(SUFFIX) cblas_zgemm_batch.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -1305,6 +1305,8 @@ sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h @@ -1340,7 +1342,19 @@ cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) -xgemmt.$(SUFFIX) xgemmt.$(PSUFFIX) : gemmt.c ../param.h +sgemmtr.$(SUFFIX) sgemmtr.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemmtr.$(SUFFIX) dgemmtr.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +qgemmtr.$(SUFFIX) qgemmtr.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemmtr.$(SUFFIX) cgemmtr.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemmtr.$(SUFFIX) zgemmtr.$(PSUFFIX) : gemmt.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c @@ -1966,9 +1980,14 @@ cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +cblas_sgemmtr.$(SUFFIX) cblas_sgemmtr.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + ifeq ($(BUILD_BFLOAT16),1) cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +cblas_sbgemmtr.$(SUFFIX) cblas_sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h @@ -1980,6 +1999,15 @@ cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +cblas_dgemmtr.$(SUFFIX) cblas_dgemmtr.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_cgemmtr.$(SUFFIX) cblas_cgemmtr.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + +cblas_zgemmtr.$(SUFFIX) cblas_zgemmtr.$(PSUFFIX) : gemmt.c ../param.h + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) + cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) From c3e7d08fb4c4471d6ee7b5c46f58ff1c4e36d2ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Oct 2024 12:56:16 +0100 Subject: [PATCH 128/244] Copy GEMMT to its new name GEMMTR --- exports/gensymbol | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index d53b980515..f3ca9a427e 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -21,7 +21,7 @@ blasobjsc=" chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2 csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm - ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt" + ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt cgemmtr" blasobjsd=" damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm @@ -29,7 +29,7 @@ blasobjsd=" dscal dsdot dspmv dspr2 dimatcopy domatcopy dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv 
dtbsv dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv - idamax idamin idmax idmin dgeadd dsum dgemmt" + idamax idamin idmax idmin dgeadd dsum dgemmt dgemmtr" blasobjss=" isamax isamin ismax ismin @@ -38,7 +38,7 @@ blasobjss=" smax smin snrm2 simatcopy somatcopy srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv - strmm strmv strsm strsv sgeadd ssum sgemmt" + strmm strmv strsm strsv sgeadd ssum sgemmt sgemmtr" blasobjsz=" izamax izamin @@ -48,17 +48,17 @@ blasobjsz=" zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv zomatcopy zimatcopy dzamax dzamin dzasum dznrm2 - zgeadd dzsum zgemmt" + zgeadd dzsum zgemmt zgemmtr" blasobjs="lsame xerbla" -bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" +bfblasobjs="sbgemm sbgemmt sbgemmtr sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod" cblasobjsc=" cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k cblas_cher cblas_cherk cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv - cblas_scnrm2 cblas_scasum cblas_cgemmt + cblas_scnrm2 cblas_scasum cblas_cgemmt cblas_cgemmtr cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin cblas_cgemm_batch " @@ -68,7 +68,7 @@ cblasobjsd=" cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2 cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv - cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt + cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt cblas_dgemmtr cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy cblas_damax cblas_damin cblas_dgemm_batch " @@ -80,7 +80,7 @@ cblasobjss=" cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm - cblas_strsv cblas_sgeadd cblas_sgemmt + cblas_strsv cblas_sgeadd cblas_sgemmt cblas_sgemmtr cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy cblas_samax cblas_samin cblas_sgemm_batch " @@ -92,7 +92,7 @@ cblasobjsz=" cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub - cblas_zaxpby cblas_zgeadd cblas_zgemmt + cblas_zaxpby cblas_zgeadd cblas_zgemmt cblas_zgemmtr cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin cblas_zgemm_batch " From d3272e51ebffc81ca37496a923c9de2918df95ca Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Mon, 1 Jul 2024 09:05:00 +1100 Subject: [PATCH 129/244] explicitly link to OpenMP --- CMakeLists.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6421cecc3..ddff73c2cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -102,6 +102,10 @@ endif() message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") +if (USE_OPENMP) + find_package(OpenMP REQUIRED) +endif () + include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake") include("${PROJECT_SOURCE_DIR}/cmake/system.cmake") @@ -258,6 +262,15 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago endif() endif() +if (USE_OPENMP) + if(BUILD_STATIC_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_static OpenMP::OpenMP_C) + endif() + if(BUILD_SHARED_LIBS) + target_link_libraries(${OpenBLAS_LIBNAME}_shared OpenMP::OpenMP_C) + endif() +endif() + # Seems that this hack doesn't required since macOS 11 Big Sur if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) From 87a18154ce311e567c30ac39fa6d42e489efd157 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Nov 2024 14:44:11 +0100 Subject: [PATCH 130/244] Update version of upload-artifacts again --- .github/workflows/nightly-Homebrew-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml index 71da7cd875..4e2b55807e 100644 --- a/.github/workflows/nightly-Homebrew-build.yml +++ b/.github/workflows/nightly-Homebrew-build.yml @@ -69,7 +69,7 @@ jobs: mv *.bottle.tar.gz bottles - name: Upload bottle - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: openblas--HEAD.catalina.bottle.tar.gz path: bottles From 2332ea7e7a835c673608ec188c89708a76b6f819 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 6 Nov 2024 18:35:31 +0100 Subject: [PATCH 131/244] fix misleading indentation --- lapack/getrf/getrf_parallel.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index fed5c1de54..b6f52ee025 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -326,16 +326,15 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * min_i = m - is; if (min_i >= GEMM_P * 2) { min_i = GEMM_P; - } else - if (min_i > GEMM_P) { + } else if (min_i > GEMM_P) { min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; - } + } - ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); + ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); - current = mypos; + current = mypos; - do { + do { div_n = (range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE; @@ -365,7 +364,7 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * current ++; if (current >= args -> nthreads) current = 0; - } while (current != mypos); + } while (current != mypos); } for (i = 0; i < args -> nthreads; i++) { From 50da5c2b347d5ea3c2a5884a36a5c6036af78c16 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Nov 2024 15:20:21 -0800 Subject: [PATCH 132/244] Add Apple M4 as VORTEX with HAVE_SME --- cpuid_arm64.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5d25d2ff69..aaf5084395 100644 --- a/cpuid_arm64.c +++ 
b/cpuid_arm64.c @@ -281,6 +281,7 @@ int detect(void) if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 + if (value64 == 1867590060) return CPU_VORTEX; //M4 #endif return CPU_ARMV8; #endif @@ -558,6 +559,8 @@ void get_cpuconfig(void) case CPU_VORTEX: printf("#define VORTEX \n"); #ifdef __APPLE__ + sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); + if (value64 == 1867590060) printf("#define HAVE_SME 1\n");; //M4 sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); printf("#define L1_CODE_SIZE %lld \n",value64); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); From 760bf7aa373ddcfefe8557e85b95c57a22ed067b Mon Sep 17 00:00:00 2001 From: Caroline Newcombe Date: Wed, 13 Nov 2024 14:05:20 -0600 Subject: [PATCH 133/244] Update Fortran return for complex data types (Cray and Nvidia compilers) --- Makefile.system | 8 +++++--- common_arm64.h | 2 +- common_x86_64.h | 4 ++++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index 8351b8efb2..21a0fc3caa 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1392,15 +1392,17 @@ endif endif ifeq ($(F_COMPILER), CRAY) -CCOMMON_OPT += -DF_INTERFACE_INTEL +CCOMMON_OPT += -DF_INTERFACE_CRAYFC FCOMMON_OPT += -hnopattern ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -s integer64 endif endif -ifneq ($(USE_OPENMP), 1) -FCOMMON_OPT += -O noomp +ifeq ($(USE_OPENMP), 1) +FCOMMON_OPT += -fopenmp +else +FCOMMON_OPT += -fno-openmp endif endif diff --git a/common_arm64.h b/common_arm64.h index 3e72e2a324..595a01995a 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -44,7 +44,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory") #endif -#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) +#if defined( F_INTERFACE_FLANG) || (defined(F_INTERFACE_PGI) && (defined(__NVCOMPILER) && (__NVCOMPILER_MAJOR__ < 23 || (__NVCOMPILER_MAJOR__ == 23 && __NVCOMPILER_MINOR__ < 9)))) #define RETURN_BY_STACK #else #define RETURN_BY_COMPLEX diff --git a/common_x86_64.h b/common_x86_64.h index 21cd198f34..143e188a79 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -283,6 +283,10 @@ static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ #define RETURN_BY_STACK #endif +#ifdef F_INTERFACE_CRAYFC +#define RETURN_BY_PACKED +#endif + #ifdef F_INTERFACE_FUJITSU #define RETURN_BY_STACK #endif From cea9df36438466adf528f7a397c72c88efebd3ba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Nov 2024 14:56:30 -0800 Subject: [PATCH 134/244] Update Cray compiler options and calling convention --- cmake/fc.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index db818e4a03..4ce1c99d4b 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -257,13 +257,15 @@ if (${F_COMPILER} STREQUAL "COMPAQ") endif () if (${F_COMPILER} STREQUAL "CRAY") - set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_INTEL") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_CRAYFC") set(FCOMMON_OPT "${FCOMMON_OPT} -hnopattern") if (INTERFACE64) set (FCOMMON_OPT "${FCOMMON_OPT} -s integer64") endif () if (NOT USE_OPENMP) - set(FCOMMON_OPT "${FCOMMON_OPT} -O noomp") + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-openmp") + else () + set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") endif () endif () From 926e56e3892f58c82588a7eb6aa72ba302bc0b22 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Nov 2024 14:04:25 -0800 Subject: [PATCH 135/244] Align GEMM3M parameters for GENERIC with ZGEMM and add P/Q/R --- param.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/param.h b/param.h index fee9195d02..27743c6ef3 100644 --- a/param.h +++ b/param.h @@ -4033,6 +4033,8 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 +#define CGEMM3M_DEFAULT_UNROLL_N 2 +#define ZGEMM3M_DEFAULT_UNROLL_N 2 #ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 2 @@ -4048,6 +4050,18 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define XGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM3M_DEFAULT_UNROLL_M 2 +#define ZGEMM3M_DEFAULT_UNROLL_M 2 +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + #endif #ifdef ARCH_MIPS From 2a290dfc2c659f42715df84a94973b4be57c89e5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Nov 2024 14:07:08 -0800 Subject: [PATCH 136/244] forward GEMM3M calls for GENERIC targets to the regular C/ZGEMM for now --- interface/gemm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/gemm.c b/interface/gemm.c index 576e94593c..c9f810faa2 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -86,7 +86,7 @@ #endif static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = { -#ifndef GEMM3M +#if !defined(GEMM3M) || 
defined(GENERIC) GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR, From d04686acd8844443c8787148c7e22db35babbf05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Nov 2024 14:09:01 -0800 Subject: [PATCH 137/244] Re-enable the EXPRECISION option for non-Windows x86/x86_64 --- Makefile.system | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/Makefile.system b/Makefile.system index 21a0fc3caa..48099d8bbb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -446,7 +446,7 @@ endif ifeq ($(OSNAME), Linux) EXTRALIB += -lm -NO_EXPRECISION = 1 +#NO_EXPRECISION = 1 endif ifeq ($(OSNAME), Android) @@ -572,7 +572,7 @@ NO_BINARY_MODE = 1 endif ifeq ($(CORE), generic) -NO_EXPRECISION = 1 +#NO_EXPRECISION = 1 endif ifndef NO_EXPRECISION @@ -595,7 +595,7 @@ endif ifeq ($(ARCH), x86_64) ifeq ($(CORE), generic) -NO_EXPRECISION = 1 +#NO_EXPRECISION = 1 endif ifndef NO_EXPRECISION @@ -828,8 +828,8 @@ BINARY_DEFINED = 1 ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), GCC) -# EXPRECISION = 1 -# CCOMMON_OPT += -DEXPRECISION +EXPRECISION = 1 +CCOMMON_OPT += -DEXPRECISION endif endif endif @@ -1392,17 +1392,15 @@ endif endif ifeq ($(F_COMPILER), CRAY) -CCOMMON_OPT += -DF_INTERFACE_CRAYFC +CCOMMON_OPT += -DF_INTERFACE_INTEL FCOMMON_OPT += -hnopattern ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -s integer64 endif endif -ifeq ($(USE_OPENMP), 1) -FCOMMON_OPT += -fopenmp -else -FCOMMON_OPT += -fno-openmp +ifneq ($(USE_OPENMP), 1) +FCOMMON_OPT += -O noomp endif endif From 4060dd43e308e4dfa32a7024b98ef62314e9ed18 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Nov 2024 15:16:17 -0800 Subject: [PATCH 138/244] Add dummy implementations of openblas_get/set_affinity --- driver/others/blas_server_omp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4341389d81..38b48fc842 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -126,6 +126,18 @@ void openblas_set_num_threads(int num_threads) { goto_set_num_threads(num_threads); } +#ifdef OS_LINUX + +int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { + fprintf(stderr,"OpenBLAS: use OpenMP environment variables for setting cpu affinity\n"); + return -1; +} +int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { + fprintf(stderr,"OpenBLAS: use OpenMP environment variables for querying cpu affinity\n"); + return -1; +} +#endif + int blas_thread_init(void){ #if defined(__FreeBSD__) && defined(__clang__) From 9db51f790a53a0f9af295ca284bc76ce648537ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 17 Nov 2024 23:19:58 +0100 Subject: [PATCH 139/244] Remove any optimization flags from DEBUG builds on POWER architecture --- Makefile.system | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.system b/Makefile.system index 21a0fc3caa..29ea819f13 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1615,6 +1615,13 @@ NO_AFFINITY = 1 endif endif +ifeq ($(ARCH), POWER) +ifeq ($(DEBUG), 1) +CCOMMON_OPT := $(filter-out -O%, $(CCOMMON_OPT)) -O0 +FCOMMON_OPT := $(filter-out -O%, $(FCOMMON_OPT)) -O0 +endif +endif + ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) override undefine NO_AFFINITY From bfaf5b9ea442633ca5e3c6968c375b933b1794ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Villemot?= Date: Wed, 20 Nov 2024 11:41:52 +0100 Subject: [PATCH 140/244] 
Restore libsuffix support in pkg-config file It had been mistakenly removed in 9ef10ffa496b919c25aedbb4aa2fdb930901475a. --- openblas.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openblas.pc.in b/openblas.pc.in index d9bb845499..7632645ac1 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -2,6 +2,6 @@ Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix} +Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix}${libsuffix} Libs.private: ${extralib} Cflags: -I${includedir} ${omp_opt} From a0131e56e09c75372740579981becf75fab11edd Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 21 Nov 2024 13:56:54 +0100 Subject: [PATCH 141/244] doc: update README to link to the html docs and fix links Also some minor formatting improvements and linking the home page. --- README.md | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index a31588be02..f527fd429c 100644 --- a/README.md +++ b/README.md @@ -15,11 +15,14 @@ OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=O OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version. -Please read the documentation in the OpenBLAS folder: . +For more information about OpenBLAS, please see: + +- The documentation at [openmathlib.org/OpenBLAS/docs/](http://www.openmathlib.org/OpenBLAS/docs), +- The home page at [openmathlib.org/OpenBLAS/](http://www.openmathlib.org/OpenBLAS). For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: . On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six -20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare or Youtube may be helpful. +20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare [here](https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/) or YouTube [here](https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek) may be helpful. ## Binary Packages @@ -27,15 +30,17 @@ We provide official binary packages for the following platform: * Windows x86/x86_64 -You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/OpenMathLib/OpenBLAS/releases](https://github.com/OpenMathLib/OpenBLAS/releases). +You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the [Releases section of the GitHub project page](https://github.com/OpenMathLib/OpenBLAS/releases). + +OpenBLAS is also packaged for many package managers - see [the installation section of the docs](http://www.openmathlib.org/OpenBLAS/docs/install/) for details. ## Installation from Source -Download from project homepage, https://github.com/OpenMathLib/OpenBLAS/, or check out the code -using Git from https://github.com/OpenMathLib/OpenBLAS.git. 
(If you want the most up to date version, be -sure to use the develop branch - master is several years out of date due to a change of maintainership.) -Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. -Most can also be given directly on the make or cmake command line. +Obtain the source code from https://github.com/OpenMathLib/OpenBLAS/. Note that the default branch +is `develop` (a `master` branch is still present, but far out of date). + +Build-time parameters can be chosen in `Makefile.rule`, see there for a short description of each option. +Most options can also be given directly on the command line as parameters to your `make` or `cmake` invocation. ### Dependencies @@ -60,6 +65,9 @@ For building with `cmake`, the usual conventions apply, i.e. create a build dire OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any build options you plan to set. +For more details, see the [Building from source](http://www.openmathlib.org/OpenBLAS/docs/install/#building-from-source) +section in the docs. + ### Cross compile Set `CC` and `FC` to point to the cross toolchains, and if you use `make`, also set `HOSTCC` to your host C compiler. @@ -76,10 +84,12 @@ Examples: make CC="i686-w64-mingw32-gcc -Bstatic" FC="i686-w64-mingw32-gfortran -static-libgfortran" TARGET=HASWELL BINARY=32 CROSS=1 NUM_THREADS=20 CONSISTENT_FPCSR=1 HOSTCC=gcc ``` -You can find instructions for other cases both in the "Supported Systems" section below and in the docs folder. The .yml scripts included with the sources (which contain the +You can find instructions for other cases both in the "Supported Systems" section below and in +the [Building from source docs](http://www.openmathlib.org/OpenBLAS/docs/install). +The `.yml` scripts included with the sources (which contain the build scripts for the "continuous integration" (CI) build tests automatically run on every proposed change to the sources) may also provide additional hints. -When compiling for a more modern CPU TARGET of the same architecture, e.g. TARGET=SKYLAKEX on a HASWELL host, option "CROSS=1" can be used to suppress the automatic invocation of the tests at the end of the build. +When compiling for a more modern CPU target of the same architecture, e.g. `TARGET=SKYLAKEX` on a `HASWELL` host, option `CROSS=1` can be used to suppress the automatic invocation of the tests at the end of the build. ### Debug version @@ -325,7 +335,7 @@ Please see Changelog.txt. ## Troubleshooting -* Please read the [FAQ](https://github.com/OpenMathLib/OpenBLAS/docs/faq,md) in the docs folder first. +* Please read the [FAQ](www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. @@ -350,4 +360,4 @@ Please see Changelog.txt. ## Donation -Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation). +Please see [the donations section](http://www.openmathlib.org/OpenBLAS/docs/about/#donations) in the docs. 
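A minimal invocation illustrating the `CROSS=1` behavior described in the updated README above (building on a `HASWELL` host for the more modern `SKYLAKEX` target of the same architecture, where the flag only suppresses the automatic post-build tests; `TARGET` and `CROSS` are the standard OpenBLAS make variables and the host/target pairing is the README's own example):

```sh
make TARGET=SKYLAKEX CROSS=1
```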
From 0b3db03d4b41b86bb26170e4f4e36785ced9d947 Mon Sep 17 00:00:00 2001 From: daichengrong Date: Fri, 22 Nov 2024 11:13:24 +0800 Subject: [PATCH 142/244] added optimizations for RISC-V YIELDING --- common.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common.h b/common.h index b8bac1ad27..b3fc6d7ded 100644 --- a/common.h +++ b/common.h @@ -372,6 +372,12 @@ typedef int blasint; #endif #endif +#if defined(ARCH_RISCV64) +#ifndef YIELDING +#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); +#endif +#endif + #ifdef __EMSCRIPTEN__ #define YIELDING From 3a63bbabd1e032b4e0e5ef4199f7c19ff1a5594e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 22 Nov 2024 12:10:56 +0100 Subject: [PATCH 143/244] Add compiler version notes and mention the f2c fallback LAPACK --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f527fd429c..d8e73b2022 100644 --- a/README.md +++ b/README.md @@ -47,9 +47,12 @@ Most options can also be given directly on the command line as parameters to you Building OpenBLAS requires the following to be installed: * GNU Make or CMake -* A C compiler, e.g. GCC or Clang +* A C compiler, e.g. GCC or Clang * A Fortran compiler (optional, for LAPACK) +In general, using a recent version of the compiler is strongly recommended. +If a Fortran compiler is not available, it is possible to compile an older version of the included LAPACK +that has been machine-translated to C. ### Normal compile @@ -339,7 +342,10 @@ Please see Changelog.txt. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. -* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels. +* Please use GCC version 6 or LLVM version 6 and above to compile Skylake/CooperLake AVX512 kernels +* Please use LLVM version 18 and above (version 19 and above on Windows) if you plan to use + its new flang compiler for Fortran +* Please use GCC version 11 and above to compile OpenBLAS on the POWER architecture * The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`), there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build the library with `BIGNUMA=1`. From 760a5371f317fda909c2d6850e3d3a71b2d7d280 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 22 Nov 2024 15:59:45 +0100 Subject: [PATCH 144/244] Update build instructions for WoA (use LLVM19 and its flang-new) --- docs/install.md | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/docs/install.md b/docs/install.md index ffb4659d82..33e9323cd9 100644 --- a/docs/install.md +++ b/docs/install.md @@ -439,49 +439,34 @@ To then use the built OpenBLAS shared library in Visual Studio: #### Windows on Arm +While OpenBLAS can be built with Microsoft VisualStudio (Community Edition or commercial), you would only be able to build for the GENERIC target +that does not use optimized assembly kernels, also the stock VisualStudio lacks the Fortran compiler necessary for building the LAPACK component. +It is therefore highly recommended to download the free LLVM compiler suite and use it to compile OpenBLAS outside of VisualStudio. + The following tools needs to be installed to build for Windows on Arm (WoA): -- Clang for Windows on Arm. 
- Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/). - E.g: LLVM 12 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-12.0.0/LLVM-12.0.0-woa64.exe) - Run the LLVM installer and ensure that LLVM is added to environment PATH. -- Download and install classic Flang for Windows on Arm. - Classic Flang is the only available Fortran compiler for Windows on Arm for now. - A pre-release build can be found [here](https://github.com/kaadam/flang/releases/tag/v0.1) - There is no installer for classic flang and the zip package can be - extracted and the path needs to be added to environment `PATH`. - E.g., in PowerShell: - ``` - $env:Path += ";C:\flang_woa\bin" - ``` +- LLVM for Windows on Arm. + Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/) - you want the package whose name ends in "woa64.exe". + E.g: LLVM 19 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.4/LLVM-19.1.4-woa64.exe) + Run the LLVM installer and ensure that LLVM is added to environment PATH. (If you do not want to add it to the PATH, you will need to specify + both C and Fortran compiler to Make or CMake with their full path later on) -The following steps describe how to build the static library for OpenBLAS with and without LAPACK: +The following steps describe how to build the static library for OpenBLAS with either Make or CMake: -1. Build OpenBLAS static library with BLAS and LAPACK routines with Make: +1. Build OpenBLAS with Make: ```bash - $ make CC="clang-cl" HOSTCC="clang-cl" AR="llvm-ar" BUILD_WITHOUT_LAPACK=0 NOFORTRAN=0 DYNAMIC_ARCH=0 TARGET=ARMV8 ARCH=arm64 BINARY=64 USE_OPENMP=0 PARALLEL=1 RANLIB="llvm-ranlib" MAKE=make F_COMPILER=FLANG FC=FLANG FFLAGS_NOOPT="-march=armv8-a -cpp" FFLAGS="-march=armv8-a -cpp" NEED_PIC=0 HOSTARCH=arm64 libs netlib + $ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make ``` -2. Build static library with BLAS routines using CMake: - - Classic Flang has compatibility issues with CMake, hence only BLAS routines can be compiled with CMake: - +2. Build OpenBLAS with CMake ```bash $ mkdir build $ cd build - $ cmake .. -G Ninja -DCMAKE_C_COMPILER=clang -DBUILD_WITHOUT_LAPACK=1 -DNOFORTRAN=1 -DDYNAMIC_ARCH=0 -DTARGET=ARMV8 -DARCH=arm64 -DBINARY=64 -DUSE_OPENMP=0 -DCMAKE_SYSTEM_PROCESSOR=ARM64 -DCMAKE_CROSSCOMPILING=1 -DCMAKE_SYSTEM_NAME=Windows - $ cmake --build . --config Release + $ cmake .. -G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new -DTARGET=ARMV8 -DCMAKE_BUILD_TYPE=Release + $ cmake --build . ``` -!!! tip "`getarch.exe` execution error" - - If you notice that platform-specific headers by `getarch.exe` are not - generated correctly, this could be due to a known debug runtime DLL issue for - arm64 platforms. Please check out [this page](https://linaro.atlassian.net/wiki/spaces/WOAR/pages/28677636097/Debug+run-time+DLL+issue#Workaround) - for a workaround. - - #### Generating an import library Microsoft Windows has this thing called "import libraries". 
You need it for From 009c1e0387357eff7cc9f6f7713ce92e2f17ef5b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 23 Nov 2024 14:15:04 +0100 Subject: [PATCH 145/244] fix download link for the current WoA binary of LLVM --- docs/install.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/install.md b/docs/install.md index 33e9323cd9..b842d3355b 100644 --- a/docs/install.md +++ b/docs/install.md @@ -447,8 +447,9 @@ The following tools needs to be installed to build for Windows on Arm (WoA): - LLVM for Windows on Arm. Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/) - you want the package whose name ends in "woa64.exe". - E.g: LLVM 19 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.4/LLVM-19.1.4-woa64.exe) - Run the LLVM installer and ensure that LLVM is added to environment PATH. (If you do not want to add it to the PATH, you will need to specify + (This may not always be present in the very latest point release, as building and uploading the binaries takes time.) + E.g: a LLVM 19 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.2/LLVM-19.1.2-woa64.exe). + Run the LLVM installer and ensure that LLVM is added to the environment variable PATH. (If you do not want to add it to the PATH, you will need to specify both C and Fortran compiler to Make or CMake with their full path later on) The following steps describe how to build the static library for OpenBLAS with either Make or CMake: From 7452af4471d6e71c40a5a9bec444eaaed6db5a8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Sat, 5 Aug 2023 16:48:04 +0200 Subject: [PATCH 146/244] CI (MinGW): Remove work-around with NO_AVX512 that was needed for older versions of LLVM Flang. --- .github/workflows/dynamic_arch.yml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index 669aa81168..df61eccff6 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -174,9 +174,6 @@ jobs: idx: int32 target-prefix: mingw-w64-clang-x86_64 fc-pkg: fc - # Compiling with Flang 16 seems to cause test errors on machines - # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. - no-avx512-flags: -DNO_AVX512=1 - msystem: CLANG32 idx: int32 target-prefix: mingw-w64-clang-i686 @@ -192,9 +189,6 @@ jobs: idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 fc-pkg: fc - # Compiling with Flang 16 seems to cause test errors on machines - # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. - no-avx512-flags: -DNO_AVX512=1 - msystem: UCRT64 idx: int32 target-prefix: mingw-w64-ucrt-x86_64 @@ -281,7 +275,6 @@ jobs: -DTARGET=CORE2 \ ${{ matrix.idx64-flags }} \ ${{ matrix.c-lapack-flags }} \ - ${{ matrix.no-avx512-flags }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ .. From f5e6b5b5c91f3bc6b6f5f8f47a320864ef940538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=BCtzel?= Date: Tue, 26 Nov 2024 13:14:23 +0100 Subject: [PATCH 147/244] CI (MinGW): Remove CLANG32 environment from build matrix. The CLANG32 environment is in the process of being removed from MSYS2 currently: https://www.msys2.org/news/#2024-09-23-starting-to-drop-the-clang32-environment Remove it from the build matrix ahead of its complete removal from MSYS2. 
--- .github/workflows/dynamic_arch.yml | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index df61eccff6..9e55e73467 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -158,7 +158,7 @@ jobs: strategy: fail-fast: false matrix: - msystem: [UCRT64, MINGW32, CLANG64, CLANG32] + msystem: [UCRT64, MINGW32, CLANG64] idx: [int32, int64] build-type: [Release] include: @@ -174,11 +174,6 @@ jobs: idx: int32 target-prefix: mingw-w64-clang-x86_64 fc-pkg: fc - - msystem: CLANG32 - idx: int32 - target-prefix: mingw-w64-clang-i686 - fc-pkg: cc - c-lapack-flags: -DC_LAPACK=ON - msystem: UCRT64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 @@ -197,8 +192,6 @@ jobs: exclude: - msystem: MINGW32 idx: int64 - - msystem: CLANG32 - idx: int64 defaults: run: @@ -274,7 +267,6 @@ jobs: -DNUM_THREADS=64 \ -DTARGET=CORE2 \ ${{ matrix.idx64-flags }} \ - ${{ matrix.c-lapack-flags }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ .. From 57a51d74c915e7c957a0209eef1d18a1e5eb9b32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Nov 2024 09:52:56 +0100 Subject: [PATCH 148/244] translate CMAKE_SYSTEM_NAME in compilations on or for IOS --- cmake/system_check.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 59a1358789..fc81e9797d 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -10,6 +10,10 @@ if (${HOST_OS} STREQUAL "WINDOWS") set(HOST_OS WINNT) endif () +if (${HOST_OS} STREQUAL "IOS") + set(HOST_OS DARWIN) +endif () + if (${HOST_OS} STREQUAL "LINUX") # check if we're building natively on Android (TERMUX) EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM) From 0c440f8a27b09c45cdbd380824109d9d16bde5bc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 27 Nov 2024 23:15:41 +0100 Subject: [PATCH 149/244] disable multithreading for small workloads --- interface/lapack/trtri.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/interface/lapack/trtri.c b/interface/lapack/trtri.c index 0285293892..df79f26656 100644 --- a/interface/lapack/trtri.c +++ b/interface/lapack/trtri.c @@ -127,6 +127,9 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In #endif #ifdef SMP +if (args.n <= 150) + args.nthreads = 1; +else args.nthreads = num_cpu_avail(4); if (args.nthreads == 1) { From 4918beecbef845cce1e5b496a6330dfc035d7964 Mon Sep 17 00:00:00 2001 From: "Iha, Taisei" Date: Mon, 2 Dec 2024 18:46:00 +0900 Subject: [PATCH 150/244] Loop-unrolled transposed [SD]GEMV kernels for A64FX and Neoverse V1 --- kernel/arm64/KERNEL.A64FX | 4 +- kernel/arm64/KERNEL.NEOVERSEV1 | 4 +- kernel/arm64/gemv_t_sve_v1x3.c | 152 +++++++++++++++++++++ kernel/arm64/gemv_t_sve_v4x3.c | 234 +++++++++++++++++++++++++++++++++ 4 files changed, 390 insertions(+), 4 deletions(-) create mode 100644 kernel/arm64/gemv_t_sve_v1x3.c create mode 100644 kernel/arm64/gemv_t_sve_v4x3.c diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX index 4abc840405..75f0f39a7e 100644 --- a/kernel/arm64/KERNEL.A64FX +++ b/kernel/arm64/KERNEL.A64FX @@ -2,5 +2,5 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE SGEMVNKERNEL = gemv_n_sve.c DGEMVNKERNEL = gemv_n_sve.c -SGEMVTKERNEL = gemv_t_sve.c -DGEMVTKERNEL = gemv_t_sve.c +SGEMVTKERNEL = gemv_t_sve_v4x3.c +DGEMVTKERNEL = gemv_t_sve_v4x3.c diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 
b/kernel/arm64/KERNEL.NEOVERSEV1 index 53d157a0aa..859466409e 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,4 +1,4 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE -SGEMVTKERNEL = gemv_t_sve.c -DGEMVTKERNEL = gemv_t_sve.c +SGEMVTKERNEL = gemv_t_sve_v1x3.c +DGEMVTKERNEL = gemv_t_sve_v1x3.c diff --git a/kernel/arm64/gemv_t_sve_v1x3.c b/kernel/arm64/gemv_t_sve_v1x3.c new file mode 100644 index 0000000000..e481abec7c --- /dev/null +++ b/kernel/arm64/gemv_t_sve_v1x3.c @@ -0,0 +1,152 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <arm_sve.h> + +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + iy = 0; + + if (inc_x == 1) { + BLASLONG width = (n + 3 - 1) / 3; + + FLOAT *a0_ptr = a + lda * width * 0; + FLOAT *a1_ptr = a + lda * width * 1; + FLOAT *a2_ptr = a + lda * width * 2; + + FLOAT *y0_ptr = y + inc_y * width * 0; + FLOAT *y1_ptr = y + inc_y * width * 1; + FLOAT *y2_ptr = y + inc_y * width * 2; + + for (j = 0; j < width; j++) { + svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg02 = ((j + width * 2) < n) ?
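            /* The v1x3 scheme: the n columns are split into three groups of
               `width` columns; each group k keeps its own predicate (pg00,
               pg01, pg02), forced to all-false once column j + width*k runs
               past n, so out-of-range groups contribute nothing. */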
SV_TRUE() : svpfalse(); + + SV_TYPE temp00_vec = SV_DUP(0.0); + SV_TYPE temp01_vec = SV_DUP(0.0); + SV_TYPE temp02_vec = SV_DUP(0.0); + + i = 0; + BLASLONG sve_size = SV_COUNT(); + while ((i + sve_size * 1 - 1) < m) { + SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + + temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); + temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); + temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); + + i += sve_size * 1; + } + + if (i < m) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); + + pg00 = svand_z(SV_TRUE(), pg0, pg00); + pg01 = svand_z(SV_TRUE(), pg0, pg01); + pg02 = svand_z(SV_TRUE(), pg0, pg02); + + SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + + temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); + temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); + temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); + } + + if ((j + width * 0) < n) { + temp = svaddv(SV_TRUE(), temp00_vec); + y0_ptr[iy] += alpha * temp; + } + if ((j + width * 1) < n) { + temp = svaddv(SV_TRUE(), temp01_vec); + y1_ptr[iy] += alpha * temp; + } + if ((j + width * 2) < n) { + temp = svaddv(SV_TRUE(), temp02_vec); + y2_ptr[iy] += alpha * temp; + } + iy += inc_y; + + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + } + + return(0); + } + + a_ptr = a; + for (j = 0; j < n; j++) { + temp = 0.0; + ix = 0; + for (i = 0; i < m; i++) { + temp += a_ptr[i] * x[ix]; + ix += inc_x; + } + y[iy] += alpha * temp; + iy += inc_y; + a_ptr += lda; + } + return(0); +} diff --git a/kernel/arm64/gemv_t_sve_v4x3.c b/kernel/arm64/gemv_t_sve_v4x3.c new file mode 100644 index 0000000000..77c46feb34 --- /dev/null +++ b/kernel/arm64/gemv_t_sve_v4x3.c @@ -0,0 +1,234 @@ +/*************************************************************************** +Copyright (c) 2024, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include <arm_sve.h> + +#include "common.h" + +#ifdef DOUBLE +#define SV_COUNT svcntd +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64_s64 +#define SV_DUP svdup_f64 +#else +#define SV_COUNT svcntw +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32_s64 +#define SV_DUP svdup_f32 +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + iy = 0; + + if (inc_x == 1) { + BLASLONG width = (n + 3 - 1) / 3; + + FLOAT *a0_ptr = a + lda * width * 0; + FLOAT *a1_ptr = a + lda * width * 1; + FLOAT *a2_ptr = a + lda * width * 2; + + FLOAT *y0_ptr = y + inc_y * width * 0; + FLOAT *y1_ptr = y + inc_y * width * 1; + FLOAT *y2_ptr = y + inc_y * width * 2; + + for (j = 0; j < width; j++) { + svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg10 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg20 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg30 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg11 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg21 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg31 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg12 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg22 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse(); + svbool_t pg32 = ((j + width * 2) < n) ?
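            /* The v4x3 kernel combines the same three column groups with a
               4x unroll over vector-length row blocks, hence the 3 x 4
               predicates and 3 x 4 partial-sum accumulators used below. */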
SV_TRUE() : svpfalse(); + + SV_TYPE temp00_vec = SV_DUP(0.0); + SV_TYPE temp10_vec = SV_DUP(0.0); + SV_TYPE temp20_vec = SV_DUP(0.0); + SV_TYPE temp30_vec = SV_DUP(0.0); + SV_TYPE temp01_vec = SV_DUP(0.0); + SV_TYPE temp11_vec = SV_DUP(0.0); + SV_TYPE temp21_vec = SV_DUP(0.0); + SV_TYPE temp31_vec = SV_DUP(0.0); + SV_TYPE temp02_vec = SV_DUP(0.0); + SV_TYPE temp12_vec = SV_DUP(0.0); + SV_TYPE temp22_vec = SV_DUP(0.0); + SV_TYPE temp32_vec = SV_DUP(0.0); + + i = 0; + BLASLONG sve_size = SV_COUNT(); + while ((i + sve_size * 4 - 1) < m) { + SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0); + SV_TYPE x1_vec = svld1_vnum(SV_TRUE(), x + i, 1); + SV_TYPE x2_vec = svld1_vnum(SV_TRUE(), x + i, 2); + SV_TYPE x3_vec = svld1_vnum(SV_TRUE(), x + i, 3); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); + SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); + SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); + SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); + SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); + SV_TYPE a22_vec = svld1_vnum(pg22, a2_ptr + i, 2); + SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); + + temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); + temp10_vec = svmla_m(pg10, temp10_vec, a10_vec, x1_vec); + temp20_vec = svmla_m(pg20, temp20_vec, a20_vec, x2_vec); + temp30_vec = svmla_m(pg30, temp30_vec, a30_vec, x3_vec); + temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); + temp11_vec = svmla_m(pg11, temp11_vec, a11_vec, x1_vec); + temp21_vec = svmla_m(pg21, temp21_vec, a21_vec, x2_vec); + temp31_vec = svmla_m(pg31, temp31_vec, a31_vec, x3_vec); + temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); + temp12_vec = svmla_m(pg12, temp12_vec, a12_vec, x1_vec); + temp22_vec = svmla_m(pg22, temp22_vec, a22_vec, x2_vec); + temp32_vec = svmla_m(pg32, temp32_vec, a32_vec, x3_vec); + + i += sve_size * 4; + } + + if (i < m) { + svbool_t pg0 = SV_WHILE(i + sve_size * 0, m); + svbool_t pg1 = SV_WHILE(i + sve_size * 1, m); + svbool_t pg2 = SV_WHILE(i + sve_size * 2, m); + svbool_t pg3 = SV_WHILE(i + sve_size * 3, m); + + pg00 = svand_z(SV_TRUE(), pg0, pg00); + pg10 = svand_z(SV_TRUE(), pg1, pg10); + pg20 = svand_z(SV_TRUE(), pg2, pg20); + pg30 = svand_z(SV_TRUE(), pg3, pg30); + pg01 = svand_z(SV_TRUE(), pg0, pg01); + pg11 = svand_z(SV_TRUE(), pg1, pg11); + pg21 = svand_z(SV_TRUE(), pg2, pg21); + pg31 = svand_z(SV_TRUE(), pg3, pg31); + pg02 = svand_z(SV_TRUE(), pg0, pg02); + pg12 = svand_z(SV_TRUE(), pg1, pg12); + pg22 = svand_z(SV_TRUE(), pg2, pg22); + pg32 = svand_z(SV_TRUE(), pg3, pg32); + + SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0); + SV_TYPE x1_vec = svld1_vnum(pg1, x + i, 1); + SV_TYPE x2_vec = svld1_vnum(pg2, x + i, 2); + SV_TYPE x3_vec = svld1_vnum(pg3, x + i, 3); + + SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0); + SV_TYPE a10_vec = svld1_vnum(pg10, a0_ptr + i, 1); + SV_TYPE a20_vec = svld1_vnum(pg20, a0_ptr + i, 2); + SV_TYPE a30_vec = svld1_vnum(pg30, a0_ptr + i, 3); + SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0); + SV_TYPE a11_vec = svld1_vnum(pg11, a1_ptr + i, 1); + SV_TYPE a21_vec = svld1_vnum(pg21, a1_ptr + i, 2); + SV_TYPE a31_vec = svld1_vnum(pg31, a1_ptr + i, 3); + SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0); + SV_TYPE a12_vec = svld1_vnum(pg12, a2_ptr + i, 1); + SV_TYPE a22_vec = 
svld1_vnum(pg22, a2_ptr + i, 2); + SV_TYPE a32_vec = svld1_vnum(pg32, a2_ptr + i, 3); + + temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec); + temp10_vec = svmla_m(pg10, temp10_vec, a10_vec, x1_vec); + temp20_vec = svmla_m(pg20, temp20_vec, a20_vec, x2_vec); + temp30_vec = svmla_m(pg30, temp30_vec, a30_vec, x3_vec); + temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec); + temp11_vec = svmla_m(pg11, temp11_vec, a11_vec, x1_vec); + temp21_vec = svmla_m(pg21, temp21_vec, a21_vec, x2_vec); + temp31_vec = svmla_m(pg31, temp31_vec, a31_vec, x3_vec); + temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec); + temp12_vec = svmla_m(pg12, temp12_vec, a12_vec, x1_vec); + temp22_vec = svmla_m(pg22, temp22_vec, a22_vec, x2_vec); + temp32_vec = svmla_m(pg32, temp32_vec, a32_vec, x3_vec); + } + + temp00_vec = svadd_x(SV_TRUE(), temp00_vec, temp10_vec); + temp01_vec = svadd_x(SV_TRUE(), temp01_vec, temp11_vec); + temp02_vec = svadd_x(SV_TRUE(), temp02_vec, temp12_vec); + temp20_vec = svadd_x(SV_TRUE(), temp20_vec, temp30_vec); + temp21_vec = svadd_x(SV_TRUE(), temp21_vec, temp31_vec); + temp22_vec = svadd_x(SV_TRUE(), temp22_vec, temp32_vec); + temp00_vec = svadd_x(SV_TRUE(), temp00_vec, temp20_vec); + temp01_vec = svadd_x(SV_TRUE(), temp01_vec, temp21_vec); + temp02_vec = svadd_x(SV_TRUE(), temp02_vec, temp22_vec); + + if ((j + width * 0) < n) { + temp = svaddv(SV_TRUE(), temp00_vec); + y0_ptr[iy] += alpha * temp; + } + if ((j + width * 1) < n) { + temp = svaddv(SV_TRUE(), temp01_vec); + y1_ptr[iy] += alpha * temp; + } + if ((j + width * 2) < n) { + temp = svaddv(SV_TRUE(), temp02_vec); + y2_ptr[iy] += alpha * temp; + } + iy += inc_y; + + a0_ptr += lda; + a1_ptr += lda; + a2_ptr += lda; + } + + return(0); + } + + a_ptr = a; + for (j = 0; j < n; j++) { + temp = 0.0; + ix = 0; + for (i = 0; i < m; i++) { + temp += a_ptr[i] * x[ix]; + ix += inc_x; + } + y[iy] += alpha * temp; + iy += inc_y; + a_ptr += lda; + } + return(0); +} From dc905636d12efa91b2e690ab2b1f07de45f0a6d2 Mon Sep 17 00:00:00 2001 From: Kai Pastor Date: Tue, 3 Dec 2024 07:42:44 +0100 Subject: [PATCH 151/244] arm: Declare symbols as .type function --- common_arm.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/common_arm.h b/common_arm.h index 80aabc7b02..d6291018b1 100644 --- a/common_arm.h +++ b/common_arm.h @@ -102,9 +102,16 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) +#if !defined(__APPLE__) && !defined(_WIN32) +#define OPENBLAS_ARM_TYPE_FUNCTION .type REALNAME, %function ; +#else +#define OPENBLAS_ARM_TYPE_FUNCTION +#endif + #define PROLOGUE \ .arm ;\ .global REALNAME ;\ + OPENBLAS_ARM_TYPE_FUNCTION \ REALNAME: #define EPILOGUE From 93eb42fdc836871943bc2582599db854716e7659 Mon Sep 17 00:00:00 2001 From: Kai Pastor Date: Tue, 3 Dec 2024 09:45:04 +0100 Subject: [PATCH 152/244] Fix redefinition of FAILED --- ctest/cblas_test.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ctest/cblas_test.h b/ctest/cblas_test.h index 3eeb46ac2c..24ea677637 100644 --- a/ctest/cblas_test.h +++ b/ctest/cblas_test.h @@ -10,6 +10,11 @@ #define int long #endif +/* e.g. 
mingw64/x86_64-w64-mingw32/include/winerror.h */ +#ifdef FAILED +#undef FAILED +#endif + #define TRUE 1 #define PASSED 1 #define TEST_ROW_MJR 1 From a8b1705dbd39f079ecf120622fa889ecdd92ac04 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 26 Nov 2024 15:21:28 -0500 Subject: [PATCH 153/244] CMake build has wrong PIC flag for NAG --- cmake/system.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 6b891ca0ef..df1095c045 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -382,6 +382,8 @@ if (NEED_PIC) if (NOT NOFORTRAN) if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -pic") + elseif (${F_COMPILER} STREQUAL "NAG") + set(FCOMMON_OPT "${FCOMMON_OPT} -PIC") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") endif () From 2eaf285de53d5f064e15e8e7ee9d3dd4cef61455 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 26 Nov 2024 15:26:55 -0500 Subject: [PATCH 154/244] Use F_COMPILER name --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index df1095c045..82d16c92fa 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -382,7 +382,7 @@ if (NEED_PIC) if (NOT NOFORTRAN) if (${F_COMPILER} STREQUAL "SUN") set(FCOMMON_OPT "${FCOMMON_OPT} -pic") - elseif (${F_COMPILER} STREQUAL "NAG") + elseif (${F_COMPILER} STREQUAL "NAGFOR") set(FCOMMON_OPT "${FCOMMON_OPT} -PIC") else () set(FCOMMON_OPT "${FCOMMON_OPT} -fPIC") From be19966d3b7618625febeca35fe6be57899f4aea Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 4 Dec 2024 10:52:43 -0500 Subject: [PATCH 155/244] Fixes for NAG CMake --- cmake/fc.cmake | 25 +++++++++++++++++++++++++ cmake/lapack.cmake | 7 ++++++- cmake/system.cmake | 2 +- 3 files changed, 32 insertions(+), 2 deletions(-) diff --git a/cmake/fc.cmake b/cmake/fc.cmake index 4ce1c99d4b..38bd406a3a 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -269,6 +269,31 @@ if (${F_COMPILER} STREQUAL "CRAY") endif () endif () +if (${F_COMPILER} STREQUAL "NAGFOR") + set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_NAG") + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -i8") + endif () + # Options from Makefile.system + # -dcfuns: Enable non-standard double precision complex intrinsic functions + # -ieee=full: enables all IEEE arithmetic facilities including non-stop arithmetic. + # -w=obs: Suppress warning messages about obsolescent features + # -thread_safe: Compile code for safe execution in a multi-threaded environment. + # -recursive: Specifies that procedures are RECURSIVE by default. + set(FCOMMON_OPT "${FCOMMON_OPT} -dcfuns -recursive -ieee=full -w=obs -thread_safe") + # Options from Reference-LAPACK + # Suppress compiler banner and summary + set(FCOMMON_OPT "${FCOMMON_OPT} -quiet") + # Disable other common warnings + # -w=x77: Suppress warning messages about Fortran 77 features + # -w=ques: Suppress warning messages about questionable usage + # -w=unused: Suppress warning messages about unused variables + set(FCOMMON_OPT "${FCOMMON_OPT} -w=x77 -w=ques -w=unused") + if (USE_OPENMP) + set(FCOMMON_OPT "${FCOMMON_OPT} -openmp") + endif () +endif () + # from the root Makefile - this is for lapack-netlib to compile the correct secnd file. 
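# A hypothetical configure invocation that would exercise the NAGFOR branch
# above might be `cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=nagfor ..`
# (the executable names are illustrative and depend on the local NAG installation).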
if (${F_COMPILER} STREQUAL "GFORTRAN") set(TIMER "INT_ETIME") diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 003a8b3c17..6a74fb7640 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1018,7 +1018,12 @@ foreach (LA_FILE ${LA_GEN_SRC}) endforeach () if (NOT C_LAPACK) - set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + # The below line is duplicating Fortran flags but NAG has a few flags + # that cannot be specified twice. It's possible this is not needed for + # any compiler, but for safety, we only turn off for NAG + if (NOT ${F_COMPILER} STREQUAL "NAGFOR") + set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}") + endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize") endif() diff --git a/cmake/system.cmake b/cmake/system.cmake index 82d16c92fa..b58a0f4b55 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -642,7 +642,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Windows") endif () if (CMAKE_Fortran_COMPILER) -if ("${F_COMPILER}" STREQUAL "NAG" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") message(STATUS "removing fortran flags") From 35334ed2ea7cd5859b1ac7b767df2854d69d8f55 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 4 Dec 2024 10:53:05 -0500 Subject: [PATCH 156/244] Fixes for Fortran Standards violations for lapack-netlib --- lapack-netlib/SRC/claqp2rk.f | 6 +++--- lapack-netlib/SRC/claqp3rk.f | 6 +++--- lapack-netlib/TESTING/EIG/cchkhb2stg.f | 5 +++-- lapack-netlib/TESTING/EIG/dchksb2stg.f | 5 +++-- lapack-netlib/TESTING/EIG/schksb2stg.f | 5 +++-- lapack-netlib/TESTING/EIG/zchkhb2stg.f | 5 +++-- lapack-netlib/TESTING/LIN/alahd.f | 2 +- 7 files changed, 19 insertions(+), 15 deletions(-) diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f index 6b1db085aa..0501c50bb4 100644 --- a/lapack-netlib/SRC/claqp2rk.f +++ b/lapack-netlib/SRC/claqp2rk.f @@ -378,7 +378,7 @@ SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, EXTERNAL CLARF, CLARFG, CSWAP * .. * .. Intrinsic Functions .. - INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT + INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN, SQRT * .. * .. External Functions .. LOGICAL SISNAN @@ -599,8 +599,8 @@ SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL, * IF( SISNAN( REAL( TAU(KK) ) ) ) THEN TAUNAN = REAL( TAU(KK) ) - ELSE IF( SISNAN( IMAG( TAU(KK) ) ) ) THEN - TAUNAN = IMAG( TAU(KK) ) + ELSE IF( SISNAN( AIMAG( TAU(KK) ) ) ) THEN + TAUNAN = AIMAG( TAU(KK) ) ELSE TAUNAN = ZERO END IF diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f index 3703bcbd65..8fe5a220ff 100644 --- a/lapack-netlib/SRC/claqp3rk.f +++ b/lapack-netlib/SRC/claqp3rk.f @@ -431,7 +431,7 @@ SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, EXTERNAL CGEMM, CGEMV, CLARFG, CSWAP * .. * .. Intrinsic Functions .. - INTRINSIC ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT + INTRINSIC ABS, REAL, CONJG, AIMAG, MAX, MIN, SQRT * .. * .. External Functions .. 
LOGICAL SISNAN @@ -739,8 +739,8 @@ SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL, * IF( SISNAN( REAL( TAU(K) ) ) ) THEN TAUNAN = REAL( TAU(K) ) - ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN - TAUNAN = IMAG( TAU(K) ) + ELSE IF( SISNAN( AIMAG( TAU(K) ) ) ) THEN + TAUNAN = AIMAG( TAU(K) ) ELSE TAUNAN = ZERO END IF diff --git a/lapack-netlib/TESTING/EIG/cchkhb2stg.f b/lapack-netlib/TESTING/EIG/cchkhb2stg.f index 1a11ac5eaf..7500c22791 100644 --- a/lapack-netlib/TESTING/EIG/cchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkhb2stg.f @@ -852,8 +852,9 @@ SUBROUTINE CCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, CALL SLASUM( 'CHB', NOUNIT, NERRS, NTESTT ) RETURN * - 9999 FORMAT( ' CCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', - $ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) + 9999 FORMAT( ' CCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, + $ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, + $ ')' ) 9998 FORMAT( / 1X, A3, $ ' -- Complex Hermitian Banded Tridiagonal Reduction Routines' $ ) diff --git a/lapack-netlib/TESTING/EIG/dchksb2stg.f b/lapack-netlib/TESTING/EIG/dchksb2stg.f index 878da8b6f0..4e807f1c88 100644 --- a/lapack-netlib/TESTING/EIG/dchksb2stg.f +++ b/lapack-netlib/TESTING/EIG/dchksb2stg.f @@ -840,8 +840,9 @@ SUBROUTINE DCHKSB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, CALL DLASUM( 'DSB', NOUNIT, NERRS, NTESTT ) RETURN * - 9999 FORMAT( ' DCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', - $ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) + 9999 FORMAT( ' DCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, + $ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, + $ ')' ) * 9998 FORMAT( / 1X, A3, $ ' -- Real Symmetric Banded Tridiagonal Reduction Routines' ) diff --git a/lapack-netlib/TESTING/EIG/schksb2stg.f b/lapack-netlib/TESTING/EIG/schksb2stg.f index 5de9204979..eee486ade7 100644 --- a/lapack-netlib/TESTING/EIG/schksb2stg.f +++ b/lapack-netlib/TESTING/EIG/schksb2stg.f @@ -840,8 +840,9 @@ SUBROUTINE SCHKSB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, CALL SLASUM( 'SSB', NOUNIT, NERRS, NTESTT ) RETURN * - 9999 FORMAT( ' SCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', - $ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) + 9999 FORMAT( ' SCHKSB2STG: ', A, ' returned INFO=', I6, '.', / 9X, + $ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, + $ ')' ) * 9998 FORMAT( / 1X, A3, $ ' -- Real Symmetric Banded Tridiagonal Reduction Routines' ) diff --git a/lapack-netlib/TESTING/EIG/zchkhb2stg.f b/lapack-netlib/TESTING/EIG/zchkhb2stg.f index 786df7882c..bfe6ceadca 100644 --- a/lapack-netlib/TESTING/EIG/zchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkhb2stg.f @@ -849,8 +849,9 @@ SUBROUTINE ZCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, CALL DLASUM( 'ZHB', NOUNIT, NERRS, NTESTT ) RETURN * - 9999 FORMAT( ' ZCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, 'N=', - $ I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, ')' ) + 9999 FORMAT( ' ZCHKHB2STG: ', A, ' returned INFO=', I6, '.', / 9X, + $ 'N=', I6, ', JTYPE=', I6, ', ISEED=(', 3( I5, ',' ), I5, + $ ')' ) 9998 FORMAT( / 1X, A3, $ ' -- Complex Hermitian Banded Tridiagonal Reduction Routines' $ ) diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f index 8f966c5841..c0334b5de9 100644 --- a/lapack-netlib/TESTING/LIN/alahd.f +++ b/lapack-netlib/TESTING/LIN/alahd.f @@ -954,7 +954,7 @@ SUBROUTINE ALAHD( IOUNIT, PATH ) $ 4X, '10. Random, Last columns are zero starting from', $ ' MINMN/2+1, CNDNUM = 2', / $ 4X, '11. 
Random, Half MINMN columns in the middle are', - $ ' zero starting from MINMN/2-(MINMN/2)/2+1,' + $ ' zero starting from MINMN/2-(MINMN/2)/2+1,', $ ' CNDNUM = 2', / $ 4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', / $ 4X, '13. Random, Even columns are ZERO, CNDNUM = 2', / From d3b2036d49c16cb9f7520d3ee5d3d39d349c18bf Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 4 Dec 2024 12:09:24 -0500 Subject: [PATCH 157/244] Move to use ERROR STOP instead of ABORT --- ctest/c_cblat1.f | 6 +++--- ctest/c_cblat2.f | 10 +++++----- ctest/c_cblat3.f | 14 +++++++------- ctest/c_cblat3_3m.f | 14 +++++++------- ctest/c_dblat1.f | 10 +++++----- ctest/c_dblat2.f | 10 +++++----- ctest/c_dblat3.f | 14 +++++++------- ctest/c_sblat1.f | 10 +++++----- ctest/c_sblat2.f | 10 +++++----- ctest/c_sblat3.f | 14 +++++++------- ctest/c_zblat1.f | 6 +++--- ctest/c_zblat2.f | 10 +++++----- ctest/c_zblat3.f | 14 +++++++------- ctest/c_zblat3_3m.f | 14 +++++++------- 14 files changed, 78 insertions(+), 78 deletions(-) diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 73ab485bbd..2af54e7a65 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -41,7 +41,7 @@ PROGRAM CCBLAT1 IF (PASS) THEN WRITE (NOUT,99998) ELSE - CALL ABORT + ERROR STOP END IF 20 CONTINUE * @@ -231,7 +231,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(ICAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - CALL ABORT + ERROR STOP END IF * 40 CONTINUE @@ -515,7 +515,7 @@ SUBROUTINE CHECK2(SFAC) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - CALL ABORT + ERROR STOP END IF * 40 CONTINUE diff --git a/ctest/c_cblat2.f b/ctest/c_cblat2.f index d48c10b7c8..d31884cddc 100644 --- a/ctest/c_cblat2.f +++ b/ctest/c_cblat2.f @@ -10,7 +10,7 @@ PROGRAM CBLAT2 * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -243,7 +243,7 @@ PROGRAM CBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - CALL ABORT + ERROR STOP 70 LTEST( I ) = LTESTT GO TO 50 * @@ -283,7 +283,7 @@ PROGRAM CBLAT2 SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANS = 'T' CALL CMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -291,7 +291,7 @@ PROGRAM CBLAT2 SAME = LCE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -419,7 +419,7 @@ PROGRAM CBLAT2 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f index 5d289aafe0..f713b2dd0a 100644 --- a/ctest/c_cblat3.f +++ b/ctest/c_cblat3.f @@ -10,7 +10,7 @@ PROGRAM CBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -194,7 +194,7 @@ PROGRAM CBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -237,7 +237,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -246,7 +246,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -264,7 +264,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -273,7 +273,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -386,7 +386,7 @@ PROGRAM CBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_cblat3_3m.f b/ctest/c_cblat3_3m.f index 73fca5664f..3f8157b0ed 100644 --- a/ctest/c_cblat3_3m.f +++ b/ctest/c_cblat3_3m.f @@ -10,7 +10,7 @@ PROGRAM CBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -194,7 +194,7 @@ PROGRAM CBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -237,7 +237,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -246,7 +246,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -264,7 +264,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL CMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -273,7 +273,7 @@ PROGRAM CBLAT3 SAME = LCE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. 
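*
*     ERROR STOP is standard Fortran 2008, while CALL ABORT was a
*     compiler extension that raised SIGABRT; both make a failure
*     visible to a driving script, but ERROR STOP terminates cleanly.
*     A minimal sketch of the pattern these drivers now use (the
*     message text is illustrative only):
*
*        IF( FATAL )THEN
*           ERROR STOP 'BLAS tests failed'
*        END IF
*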
@@ -386,7 +386,7 @@ PROGRAM CBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index 99c8b5da49..4877ea62b8 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -47,7 +47,7 @@ PROGRAM DCBLAT1 IF (PASS) THEN WRITE (NOUT,99998) ELSE - CALL ABORT + ERROR STOP END IF 20 CONTINUE * @@ -139,7 +139,7 @@ SUBROUTINE CHECK0(SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' - CALL ABORT + ERROR STOP END IF 20 CONTINUE 40 RETURN @@ -232,7 +232,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(IDAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - CALL ABORT + ERROR STOP END IF 60 CONTINUE 80 CONTINUE @@ -387,7 +387,7 @@ SUBROUTINE CHECK2(SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - CALL ABORT + ERROR STOP END IF 100 CONTINUE 120 CONTINUE @@ -475,7 +475,7 @@ SUBROUTINE CHECK3(SFAC) 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' - CALL ABORT + ERROR STOP END IF 40 CONTINUE 60 CONTINUE diff --git a/ctest/c_dblat2.f b/ctest/c_dblat2.f index 01a21a7163..342382c9ed 100644 --- a/ctest/c_dblat2.f +++ b/ctest/c_dblat2.f @@ -10,7 +10,7 @@ PROGRAM DBLAT2 * 'DBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -239,7 +239,7 @@ PROGRAM DBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - CALL ABORT + ERROR STOP 70 LTEST( I ) = LTESTT GO TO 50 * @@ -279,7 +279,7 @@ PROGRAM DBLAT2 SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANS = 'T' CALL DMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -287,7 +287,7 @@ PROGRAM DBLAT2 SAME = LDE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -415,7 +415,7 @@ PROGRAM DBLAT2 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_dblat3.f b/ctest/c_dblat3.f index 00d16c2961..cbd95b8544 100644 --- a/ctest/c_dblat3.f +++ b/ctest/c_dblat3.f @@ -10,7 +10,7 @@ PROGRAM DBLAT3 * 'DBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -189,7 +189,7 @@ PROGRAM DBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -232,7 +232,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -241,7 +241,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -259,7 +259,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'T' CALL DMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -268,7 +268,7 @@ PROGRAM DBLAT3 SAME = LDE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -380,7 +380,7 @@ PROGRAM DBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index b88c2b7835..2e7c1d9b3f 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -47,7 +47,7 @@ PROGRAM SCBLAT1 IF (PASS) THEN WRITE (NOUT,99998) ELSE - CALL ABORT + ERROR STOP END IF 20 CONTINUE * @@ -139,7 +139,7 @@ SUBROUTINE CHECK0(SFAC) CALL STEST1(SS,DS1(K),DS1(K),SFAC) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK0' - CALL ABORT + ERROR STOP END IF 20 CONTINUE 40 RETURN @@ -232,7 +232,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(ISAMAXTEST(N,SX,INCX),ITRUE2(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - CALL ABORT + ERROR STOP END IF 60 CONTINUE 80 CONTINUE @@ -387,7 +387,7 @@ SUBROUTINE CHECK2(SFAC) CALL STEST(LENY,SY,STY,SSIZE2(1,1),1.0E0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - CALL ABORT + ERROR STOP END IF 100 CONTINUE 120 CONTINUE @@ -482,7 +482,7 @@ SUBROUTINE CHECK3(SFAC) 70 CONTINUE ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK3' - CALL ABORT + ERROR STOP END IF 40 CONTINUE 60 CONTINUE diff --git a/ctest/c_sblat2.f b/ctest/c_sblat2.f index 18d568d5d3..00cbc8f011 100644 --- a/ctest/c_sblat2.f +++ b/ctest/c_sblat2.f @@ -10,7 +10,7 @@ PROGRAM SBLAT2 * 'SBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -239,7 +239,7 @@ PROGRAM SBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - CALL ABORT + ERROR STOP 70 LTEST( I ) = LTESTT GO TO 50 * @@ -279,7 +279,7 @@ PROGRAM SBLAT2 SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANS = 'T' CALL SMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -287,7 +287,7 @@ PROGRAM SBLAT2 SAME = LSE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. 
@@ -415,7 +415,7 @@ PROGRAM SBLAT2 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_sblat3.f b/ctest/c_sblat3.f index bbb58d04f6..61bf46997f 100644 --- a/ctest/c_sblat3.f +++ b/ctest/c_sblat3.f @@ -10,7 +10,7 @@ PROGRAM SBLAT3 * 'SBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -188,7 +188,7 @@ PROGRAM SBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -231,7 +231,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -240,7 +240,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -258,7 +258,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'T' CALL SMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -267,7 +267,7 @@ PROGRAM SBLAT3 SAME = LSE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.ZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -379,7 +379,7 @@ PROGRAM SBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index 43486433e3..1d48159c91 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -41,7 +41,7 @@ PROGRAM ZCBLAT1 IF (PASS) THEN WRITE (NOUT,99998) ELSE - CALL ABORT + ERROR STOP END IF 20 CONTINUE * @@ -231,7 +231,7 @@ SUBROUTINE CHECK1(SFAC) CALL ITEST1(IZAMAXTEST(N,CX,INCX),ITRUE3(NP1)) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK1' - CALL ABORT + ERROR STOP END IF * 40 CONTINUE @@ -515,7 +515,7 @@ SUBROUTINE CHECK2(SFAC) CALL CTEST(LENY,CY,CT10Y(1,KN,KI),CSIZE3,1.0D0) ELSE WRITE (NOUT,*) ' Shouldn''t be here in CHECK2' - CALL ABORT + ERROR STOP END IF * 40 CONTINUE diff --git a/ctest/c_zblat2.f b/ctest/c_zblat2.f index daa1a603b2..220e2fd259 100644 --- a/ctest/c_zblat2.f +++ b/ctest/c_zblat2.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT2 * 'CBLAT2.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -243,7 +243,7 @@ PROGRAM ZBLAT2 $ GO TO 70 60 CONTINUE WRITE( NOUT, FMT = 9986 )SNAMET - CALL ABORT + ERROR STOP 70 LTEST( I ) = LTESTT GO TO 50 * @@ -283,7 +283,7 @@ PROGRAM ZBLAT2 SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANS = 'T' CALL ZMVCH( TRANS, N, N, ONE, A, NMAX, X, -1, ZERO, Y, -1, YT, G, @@ -291,7 +291,7 @@ PROGRAM ZBLAT2 SAME = LZE( YY, YT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9985 )TRANS, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -419,7 +419,7 @@ PROGRAM ZBLAT2 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f index 83eb9e9184..e14f5af65a 100644 --- a/ctest/c_zblat3.f +++ b/ctest/c_zblat3.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. * 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -195,7 +195,7 @@ PROGRAM ZBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -238,7 +238,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -247,7 +247,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -265,7 +265,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -274,7 +274,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -387,7 +387,7 @@ PROGRAM ZBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) diff --git a/ctest/c_zblat3_3m.f b/ctest/c_zblat3_3m.f index d0923439e8..6f52b64036 100644 --- a/ctest/c_zblat3_3m.f +++ b/ctest/c_zblat3_3m.f @@ -10,7 +10,7 @@ PROGRAM ZBLAT3 * 'CBLAT3.SNAP' NAME OF SNAPSHOT OUTPUT FILE * -1 UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) * F LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. -* F LOGICAL FLAG, T TO CALL ABORT ON FAILURES. +* F LOGICAL FLAG, T TO ERROR STOP ON FAILURES. * T LOGICAL FLAG, T TO TEST ERROR EXITS. 
* 2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH * 16.0 THRESHOLD VALUE OF TEST RATIO @@ -195,7 +195,7 @@ PROGRAM ZBLAT3 $ GO TO 50 40 CONTINUE WRITE( NOUT, FMT = 9990 )SNAMET - CALL ABORT + ERROR STOP 50 LTEST( I ) = LTESTT GO TO 30 * @@ -238,7 +238,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -247,7 +247,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF DO 120 J = 1, N AB( J, NMAX + 1 ) = N - J + 1 @@ -265,7 +265,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF TRANSB = 'C' CALL ZMMCH( TRANSA, TRANSB, N, 1, N, ONE, AB, NMAX, @@ -274,7 +274,7 @@ PROGRAM ZBLAT3 SAME = LZE( CC, CT, N ) IF( .NOT.SAME.OR.ERR.NE.RZERO )THEN WRITE( NOUT, FMT = 9989 )TRANSA, TRANSB, SAME, ERR - CALL ABORT + ERROR STOP END IF * * Test each subroutine in turn. @@ -387,7 +387,7 @@ PROGRAM ZBLAT3 $ CLOSE ( NTRA ) CLOSE ( NOUT ) IF( FATAL ) THEN - CALL ABORT + ERROR STOP END IF * 10002 FORMAT( ' COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED' ) From c4e8bac5a5e306731550f6ee39db99c184c31ed0 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Wed, 4 Dec 2024 12:11:35 -0500 Subject: [PATCH 158/244] Fix indent --- cmake/system.cmake | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index b58a0f4b55..4ac244e3ea 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -643,16 +643,16 @@ endif () if (CMAKE_Fortran_COMPILER) if ("${F_COMPILER}" STREQUAL "NAGFOR" OR "${F_COMPILER}" STREQUAL "CRAY" OR CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") - set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") - if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") -message(STATUS "removing fortran flags") - set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") + set(FILTER_FLAGS "-msse3;-mssse3;-msse4.1;-mavx;-mavx2,-mskylake-avx512") + if (CMAKE_Fortran_COMPILER_ID MATCHES "LLVMFlang.*") + message(STATUS "removing fortran flags") + set(FILTER_FLAGS "${FILTER_FLAGS};-m32;-m64") + endif () + foreach (FILTER_FLAG ${FILTER_FLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) + string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) + endforeach () endif () - foreach (FILTER_FLAG ${FILTER_FLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FFLAGS ${LAPACK_FFLAGS}) - string(REPLACE ${FILTER_FLAG} "" LAPACK_FPFLAGS ${LAPACK_FPFLAGS}) - endforeach () -endif () endif () if ("${F_COMPILER}" STREQUAL "GFORTRAN") From 1a6ecda3983c9daab2f94dfe5bc1fdcd759a94ea Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Dec 2024 15:32:26 -0800 Subject: [PATCH 159/244] utilize /proc/cpuinfo on NetBSD too --- cpuid_arm64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index aaf5084395..fbb78e7943 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -127,7 +127,7 @@ static char *cpuname_lower[] = { int get_feature(char *search) { -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -163,7 +163,7 @@ int get_feature(char *search) int detect(void) { -#ifdef __linux +#if defined( 
__linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; @@ -314,7 +314,7 @@ void get_cpucount(void) { int n=0; -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -608,7 +608,7 @@ void get_libname(void) void get_features(void) { -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; From a791912cbb06260e1d0271b31959f16a41ddef4c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 4 Dec 2024 15:34:57 -0800 Subject: [PATCH 160/244] handle uname returning evbarm on NetBSD --- c_check | 3 +++ 1 file changed, 3 insertions(+) diff --git a/c_check b/c_check index c2b52c81b0..c3c2901712 100755 --- a/c_check +++ b/c_check @@ -6,6 +6,9 @@ hostarch=`uname -m | sed -e 's/i.86/x86/'` if [ "$hostos" = "AIX" ] || [ "$hostos" = "SunOS" ]; then hostarch=`uname -p` fi +if [ "$hostarch" = "evbarm" ]; then + hostarch=`uname -p` +fi case "$hostarch" in amd64) hostarch=x86_64 ;; arm*) [ "$hostarch" = "arm64" ] || hostarch='arm' ;; From 5fe983db29381a86a0dcef2a1750b4da86cca69c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 5 Dec 2024 21:09:53 +0100 Subject: [PATCH 161/244] retire the thunderx2 nrm2 kernels for now due to NAN and inaccuracies --- kernel/arm64/KERNEL.ARMV8SVE | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index bfadf5cba9..7904011a82 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -104,10 +104,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.c SDOTKERNEL = dot.c From 3345007d8f4559fcd65ea7166695fda3a161e7ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 5 Dec 2024 21:12:06 +0100 Subject: [PATCH 162/244] retire the thunderx2 NRM2 kernels due to reported inaccuracies and NAN --- kernel/arm64/KERNEL.NEOVERSEN2 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/arm64/KERNEL.NEOVERSEN2 b/kernel/arm64/KERNEL.NEOVERSEN2 index cabacad46e..2f7400113b 100644 --- a/kernel/arm64/KERNEL.NEOVERSEN2 +++ b/kernel/arm64/KERNEL.NEOVERSEN2 @@ -91,10 +91,10 @@ IDAMAXKERNEL = iamax_thunderx2t99.c ICAMAXKERNEL = izamax_thunderx2t99.c IZAMAXKERNEL = izamax_thunderx2t99.c -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.c SDOTKERNEL = dot.c From 0f8ff82592b7eaf6bb61613673e64aa8ecb5ebfa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 6 Dec 2024 01:35:42 -0800 Subject: [PATCH 163/244] Add build notes for Windows and flang from gh Discussion 5008 --- benchmark/pybench/README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/benchmark/pybench/README.md b/benchmark/pybench/README.md index 7523ca75ab..43c1b3665d 100644 --- a/benchmark/pybench/README.md +++ b/benchmark/pybench/README.md @@ -43,7 +43,17 @@ have all what it takes to build OpenBLAS from source, plus `python` and $ python -mpip install 
numpy meson ninja pytest pytest-benchmark ``` -The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/test_blas.py`. +The Meson build system looks for the installed OpenBLAS using pkgconfig, so the openblas.pc created during the OpenBLAS build needs +to be somewhere on the search path of pkgconfig or in a folder pointed to by the environment variable PKG_CONFIG_PATH. + +If you want to build the benchmark suite using flang (or flang-new) instead of gfortran for the Fortran parts, you currently need +to edit the meson.build file and change the line `'fortran_std=legacy'` to `'fortran_std=none'` to work around an incompatibility +between Meson and flang. + +If you are building and running the benchmark under MS Windows, it may be necessary to copy the generated openblas_wrap module from +your build folder to the `benchmarks` folder. + +The benchmark syntax is consistent with that of `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/bench_blas.py`. An ASV compatible benchmark suite is planned but currently not implemented. From 5aea097df068e87033d46efb081640e8bf41caa0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 10 Dec 2024 23:52:05 +0100 Subject: [PATCH 164/244] add missing lapack 3.11+ symbols --- exports/gensymbol | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/exports/gensymbol b/exports/gensymbol index f3ca9a427e..d886e6d143 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -869,8 +869,12 @@ lapackobjs2z="$lapackobjs2z #functions added post 3.11 lapackobjs2c="$lapackobjs2c + cgelst + cgeqp3rk claqp2rk claqp3rk + clatrs3 + crscl ctrsyl3 " # claqz0 @@ -894,6 +898,17 @@ lapackobjs2d="$lapackobjs2d # dlaqz3 # dlaqz4 +lapackobjs2s="$lapackobjs2s + sgelst + sgeqp2rk + sgeqp3rk + slaqp2rk + slaqp3rk + slarmm + slatrs3 + strsyl3 + " + lapackobjs2z="$lapackobjs2z zgelst zgeqp3rk From 61d5aec7c1298969cb007686695dcadefa0e9f7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 11 Dec 2024 00:41:56 +0100 Subject: [PATCH 165/244] remove typo --- exports/gensymbol | 1 - 1 file changed, 1 deletion(-) diff --git a/exports/gensymbol b/exports/gensymbol index d886e6d143..f747dd091f 100755 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -900,7 +900,6 @@ lapackobjs2d="$lapackobjs2d lapackobjs2s="$lapackobjs2s sgelst - sgeqp2rk sgeqp3rk slaqp2rk slaqp3rk From b9f51a5cf7723c7fb383812c75f786d3e5c4a1ba Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:58:06 +0530 Subject: [PATCH 166/244] Delete kernel/arm64/rot.c --- kernel/arm64/rot.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 kernel/arm64/rot.c diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c deleted file mode 100644 index abddc15381..0000000000 --- a/kernel/arm64/rot.c +++ /dev/null @@ -1,40 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ -#include "common.h" -#include "rot_kernel_sve.c" -#include "rot_kernel_c.c" - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - if (n <= 0) - return (0); - if (inc_x == 1 && inc_y == 1) - rot_kernel_sve(n, x, y, c, s); - else - rot_kernel_c(n, x, inc_x, y, inc_y, c, s); - return (0); -} From 10857c9df4f915871e989496c8b2bb78f81af8e2 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:58:51 +0530 Subject: [PATCH 167/244] Delete kernel/arm64/rot_kernel_c.c --- kernel/arm64/rot_kernel_c.c | 44 ------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 kernel/arm64/rot_kernel_c.c diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c deleted file mode 100644 index f37d2db169..0000000000 --- a/kernel/arm64/rot_kernel_c.c +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ -#include "common.h" - -static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp; - while (i < n) - { - temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - ix += inc_x; - iy += inc_y; - i++; - } - return (0); -} From f62519cc87521e1c6e09972cd03f3695b01b086f Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 10:59:35 +0530 Subject: [PATCH 168/244] Delete kernel/arm64/rot_kernel_sve.c --- kernel/arm64/rot_kernel_sve.c | 59 ----------------------------------- 1 file changed, 59 deletions(-) delete mode 100644 kernel/arm64/rot_kernel_sve.c diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c deleted file mode 100644 index 0a790824f0..0000000000 --- a/kernel/arm64/rot_kernel_sve.c +++ /dev/null @@ -1,59 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*******************************************************************************/ -#include "common.h" -#include - -#ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() -#else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() -#endif - -static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) -{ - for (int i = 0; i < n; i += SVE_WIDTH) - { - svbool_t pg = SVE_WHILELT((uint32_t)i, (uint32_t)n); - SVE_TYPE x_vec = svld1(pg, &x[i]); - SVE_TYPE y_vec = svld1(pg, &y[i]); - SVE_TYPE cx_vec = svmul_z(pg, x_vec, c); - SVE_TYPE sy_vec = svmul_z(pg, y_vec, s); - SVE_TYPE sx_vec = svmul_z(pg, x_vec, s); - SVE_TYPE cy_vec = svmul_z(pg, y_vec, c); - svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec)); - svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec)); - } - return (0); -} From 5540f2121e2304e8e4682708b00af18fade7465b Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:00:12 +0530 Subject: [PATCH 169/244] Delete kernel/arm64/scal.c --- kernel/arm64/scal.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 kernel/arm64/scal.c diff --git a/kernel/arm64/scal.c b/kernel/arm64/scal.c deleted file mode 100644 index e64b0075e8..0000000000 --- a/kernel/arm64/scal.c +++ /dev/null @@ -1,40 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
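/* (A note on rot_kernel_sve, deleted above: building a fresh predicate
   with SVE_WHILELT on every iteration lets the final partial vector be
   handled without a scalar remainder loop. A minimal restatement of the
   idiom, assuming <arm_sve.h> and single precision:

     for (int i = 0; i < n; i += svcntw()) {
         svbool_t pg = svwhilelt_b32(i, n);    // masks lanes past n
         svfloat32_t v = svld1(pg, &x[i]);     // inactive lanes load as zero
         svst1(pg, &x[i], svmul_z(pg, v, c)); // inactive lanes are not stored
     }
   ) */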
-*******************************************************************************/ -#include "common.h" -#include "scal_kernel_sve.c" -#include "scal_kernel_c.c" - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - if ((n <= 0) || (inc_x <= 0)) - return (0); - if (inc_x == 1) - scal_kernel_sve(n, x, da); - else - scal_kernel_c(n, da, x, inc_x, y, inc_y); - return (0); -} From 95a97012e8e7350df05c3e3ee749dbe34feff05a Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:00:45 +0530 Subject: [PATCH 170/244] Delete kernel/arm64/scal_kernel_c.c --- kernel/arm64/scal_kernel_c.c | 43 ------------------------------------ 1 file changed, 43 deletions(-) delete mode 100644 kernel/arm64/scal_kernel_c.c diff --git a/kernel/arm64/scal_kernel_c.c b/kernel/arm64/scal_kernel_c.c deleted file mode 100644 index 659168da54..0000000000 --- a/kernel/arm64/scal_kernel_c.c +++ /dev/null @@ -1,43 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
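/* (The deleted rot.c and scal.c wrappers above, like swap.c further on,
   share one dispatch rule: the SVE kernel runs only for unit stride, and
   anything else falls back to the scalar loop. Strided data could in
   principle stay on the SVE path via gather/scatter; a hypothetical
   sketch, not taken from these files:

     svint32_t off = svindex_s32(0, (int32_t)inc_x);   // 0, inc_x, 2*inc_x, ...
     svfloat32_t v = svld1_gather_index(pg, x, off);   // strided load
     svst1_scatter_index(pg, x, off, v);               // strided store
   ) */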
-*******************************************************************************/ -#include "common.h" - -static int scal_kernel_c(BLASLONG n, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i = 0, j = 0; - - while (j < n) - { - if (da == 0.0) - x[i] = 0.0; - else - x[i] = da * x[i]; - i += inc_x; - j++; - } - return (0); -} From 3b7b74664c125e8589b8f5c4255bdb972f666dff Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:01:03 +0530 Subject: [PATCH 171/244] Delete kernel/arm64/scal_kernel_sve.c --- kernel/arm64/scal_kernel_sve.c | 54 ---------------------------------- 1 file changed, 54 deletions(-) delete mode 100644 kernel/arm64/scal_kernel_sve.c diff --git a/kernel/arm64/scal_kernel_sve.c b/kernel/arm64/scal_kernel_sve.c deleted file mode 100644 index ccd5a4cd2b..0000000000 --- a/kernel/arm64/scal_kernel_sve.c +++ /dev/null @@ -1,54 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
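/* (Note the da == 0.0 branch in scal_kernel_c above: under IEEE-754
   rules 0.0 * Inf and 0.0 * NaN are both NaN, so the explicit store of
   0.0 is what lets scal(n, 0, x) return exact zeros even when x holds
   non-finite values:

     float t = INFINITY;   // illustrative input
     t = 0.0f * t;         // yields NaN, not 0.0f
   ) */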
-*******************************************************************************/ -#include "common.h" -#include - -#ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() -#else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() -#endif - -static int scal_kernel_sve(int n, FLOAT *x, FLOAT da) -{ - for (int i = 0; i < n; i += SVE_WIDTH) - { - svbool_t pg = SVE_WHILELT(i, n); - SVE_TYPE x_vec = svld1(pg, &x[i]); - SVE_TYPE result = svmul_z(pg, x_vec, da); - svst1(pg, &x[i], result); - } - return (0); -} From f6416c0e3702a1d1d825b1500993f66a60677281 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:01:32 +0530 Subject: [PATCH 172/244] Delete kernel/arm64/swap.c --- kernel/arm64/swap.c | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 kernel/arm64/swap.c diff --git a/kernel/arm64/swap.c b/kernel/arm64/swap.c deleted file mode 100644 index c5af18e6ba..0000000000 --- a/kernel/arm64/swap.c +++ /dev/null @@ -1,40 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#include "common.h" -#include "swap_kernel_sve.c" -#include "swap_kernel_c.c" - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - if (n <= 0) - return 0; - if (inc_x == 1 && inc_y == 1) - swap_kernel_sve(n, x, y); - else - swap_kernel_c(n, x, inc_x, y, inc_y); - return (0); -} From c17c19fbcf4d6fd90326c564b6bddd06c93cfe23 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:01:46 +0530 Subject: [PATCH 173/244] Delete kernel/arm64/swap_kernel_c.c --- kernel/arm64/swap_kernel_c.c | 46 ------------------------------------ 1 file changed, 46 deletions(-) delete mode 100644 kernel/arm64/swap_kernel_c.c diff --git a/kernel/arm64/swap_kernel_c.c b/kernel/arm64/swap_kernel_c.c deleted file mode 100644 index c1d7cc619a..0000000000 --- a/kernel/arm64/swap_kernel_c.c +++ /dev/null @@ -1,46 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ -#include "common.h" -#include - -static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp; - - while (i < n) - { - temp = x[ix]; - x[ix] = y[iy]; - y[iy] = temp; - ix += inc_x; - iy += inc_y; - i++; - } - return (0); -} From 765850194e2529433433be20ae9d74ff54f6c673 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:02:01 +0530 Subject: [PATCH 174/244] Delete kernel/arm64/swap_kernel_sve.c --- kernel/arm64/swap_kernel_sve.c | 62 ---------------------------------- 1 file changed, 62 deletions(-) delete mode 100644 kernel/arm64/swap_kernel_sve.c diff --git a/kernel/arm64/swap_kernel_sve.c b/kernel/arm64/swap_kernel_sve.c deleted file mode 100644 index fed7e6d0f5..0000000000 --- a/kernel/arm64/swap_kernel_sve.c +++ /dev/null @@ -1,62 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
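The SVE swap kernel whose deletion starts above (and which is re-added later in this series) advances two vector widths per iteration. That unrolling is safe because `svwhilelt` returns an all-false predicate once its start index reaches `n`, turning the second load/store pair into a no-op. A small demonstration of the predicate behaviour, assuming SVE hardware or an emulator such as `qemu-aarch64`:

```c
#include <arm_sve.h>
#include <stdio.h>

int main(void)
{
    long vl = (long)svcntw();         /* 32-bit lanes per vector (>= 4)    */
    long n  = vl + 3;                 /* one full vector + 3-element tail  */

    svbool_t tail = svwhilelt_b32(vl, n);  /* second block of iteration 0  */
    svbool_t none = svwhilelt_b32(n, n);   /* a block starting past n      */

    printf("tail lanes: %lu, past-end lanes: %lu\n",
           (unsigned long)svcntp_b32(svptrue_b32(), tail),
           (unsigned long)svcntp_b32(svptrue_b32(), none));
    return 0;                 /* expect: tail lanes: 3, past-end lanes: 0 */
}
```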
-*******************************************************************************/ -#include "common.h" -#include - -#ifdef DOUBLE -#define SVE_TYPE svfloat64_t -#define SVE_ZERO svdup_f64(0.0) -#define SVE_WHILELT svwhilelt_b64 -#define SVE_ALL svptrue_b64() -#define SVE_WIDTH svcntd() -#else -#define SVE_TYPE svfloat32_t -#define SVE_ZERO svdup_f32(0.0) -#define SVE_WHILELT svwhilelt_b32 -#define SVE_ALL svptrue_b32() -#define SVE_WIDTH svcntw() -#endif - -static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) -{ - BLASLONG sve_width = SVE_WIDTH; - - for (BLASLONG i = 0; i < n; i += sve_width * 2) - { - svbool_t pg_a = SVE_WHILELT(i, n); - svbool_t pg_b = SVE_WHILELT((i + sve_width), n); - SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); - SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); - SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); - SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); - svst1(pg_a, &x[i], y_vec_a); - svst1(pg_a, &y[i], x_vec_a); - svst1(pg_b, &x[i + sve_width], y_vec_b); - svst1(pg_b, &y[i + sve_width], x_vec_b); - } - return (0); -} From 41912f9c22615bf2d94cecd7ea7d239a5f94e666 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:05:10 +0530 Subject: [PATCH 175/244] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index cf74524c8d..508dbcd0e6 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -230,12 +230,5 @@ In chronological order: * Christopher Daley * [2024-01-24] Optimize GEMV forwarding on ARM64 systems -* Aniket P. Garade - * [2024-10-30] Optimized scal Level-1 BLAS routines with ARM SVE - -* Sushil Pratap Singh - * [2024-10-30] Optimized swap Level-1 BLAS routines with ARM SVE - -* Juliya James - * [2024-10-30] Optimized rot Level-1 BLAS routines with ARM SVE - +* Aniket P. Garade Sushil Pratap Singh Juliya James + * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE From 06ffd411a588734793cb2057c254090aac07f7a7 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:05:47 +0530 Subject: [PATCH 176/244] Update KERNEL.ARMV8SVE --- kernel/arm64/KERNEL.ARMV8SVE | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index cecc72cf96..133fab9d62 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -69,8 +69,8 @@ DROTKERNEL = rot.c CROTKERNEL = zrot.S ZROTKERNEL = zrot.S -SSCALKERNEL = scal.c -DSCALKERNEL = scal.c +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S CSCALKERNEL = zscal.S ZSCALKERNEL = zscal.S From dd71e4234a0cb3469168f7e6a8f55a17cd02db58 Mon Sep 17 00:00:00 2001 From: CDAC-SSDG <141632518+CDAC-SSDG@users.noreply.github.com> Date: Fri, 13 Dec 2024 11:15:29 +0530 Subject: [PATCH 177/244] Added Updated swap and rot sve kernels. 
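The series now reintroduces the rot and swap kernels. For orientation, BLAS `rot` applies a plane (Givens) rotation to a vector pair: x[i]' = c*x[i] + s*y[i] and y[i]' = c*y[i] - s*x[i]. With c = cos(theta) and s = sin(theta) the transform is orthogonal, so the 2-norm of each (x[i], y[i]) pair is preserved. A scalar illustration with made-up values, not taken from the patch (compile with `-lm`):

```c
#include <math.h>
#include <stdio.h>

int main(void)
{
    double c = cos(0.3), s = sin(0.3);   /* any angle works */
    double x = 2.0, y = -1.0;

    double xr = c * x + s * y;           /* rotated x */
    double yr = c * y - s * x;           /* rotated y */

    /* Orthogonal transform: the 2-norm of the pair is unchanged. */
    printf("norm before: %f  after: %f\n", hypot(x, y), hypot(xr, yr));
    return 0;
}
```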
--- kernel/arm64/rot.c | 40 ++++++++++++++++++++++ kernel/arm64/rot_kernel_c.c | 44 ++++++++++++++++++++++++ kernel/arm64/rot_kernel_sve.c | 59 ++++++++++++++++++++++++++++++++ kernel/arm64/swap.c | 40 ++++++++++++++++++++++ kernel/arm64/swap_kernel_c.c | 46 +++++++++++++++++++++++++ kernel/arm64/swap_kernel_sve.c | 62 ++++++++++++++++++++++++++++++++++ 6 files changed, 291 insertions(+) create mode 100644 kernel/arm64/rot.c create mode 100644 kernel/arm64/rot_kernel_c.c create mode 100644 kernel/arm64/rot_kernel_sve.c create mode 100644 kernel/arm64/swap.c create mode 100644 kernel/arm64/swap_kernel_c.c create mode 100644 kernel/arm64/swap_kernel_sve.c diff --git a/kernel/arm64/rot.c b/kernel/arm64/rot.c new file mode 100644 index 0000000000..09b708494c --- /dev/null +++ b/kernel/arm64/rot.c @@ -0,0 +1,40 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" +#include "rot_kernel_sve.c" +#include "rot_kernel_c.c" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + if (n <= 0) + return (0); + if (inc_x == 1 && inc_y == 1) + rot_kernel_sve(n, x, y, c, s); + else + rot_kernel_c(n, x, inc_x, y, inc_y, c, s); + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/rot_kernel_c.c b/kernel/arm64/rot_kernel_c.c new file mode 100644 index 0000000000..788beed7a5 --- /dev/null +++ b/kernel/arm64/rot_kernel_c.c @@ -0,0 +1,44 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#include "common.h" + +static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + while (i < n) + { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/rot_kernel_sve.c b/kernel/arm64/rot_kernel_sve.c new file mode 100644 index 0000000000..1d54a2907e --- /dev/null +++ b/kernel/arm64/rot_kernel_sve.c @@ -0,0 +1,59 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
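The SVE rot kernel that follows forms the four products with separate `svmul_z` calls and combines them with `svadd_z`/`svsub_z`. ACLE also offers fused multiply-accumulate intrinsics; the variant below is a sketch of the same update written with `svmla_z`/`svmls_z` (single-precision instantiation), not the committed code:

```c
#include <arm_sve.h>

/* x' = c*x + s*y,  y' = c*y - s*x, using fused multiply-accumulate. */
static void rot_sve_fused(long n, float *x, float *y, float c, float s)
{
    for (long i = 0; i < n; i += (long)svcntw()) {
        svbool_t pg = svwhilelt_b32(i, n);
        svfloat32_t xv = svld1(pg, &x[i]);
        svfloat32_t yv = svld1(pg, &y[i]);
        /* svmla_z(pg, a, b, k) computes a + b*k; svmls_z computes a - b*k. */
        svfloat32_t xr = svmla_z(pg, svmul_z(pg, xv, c), yv, s);
        svfloat32_t yr = svmls_z(pg, svmul_z(pg, yv, c), xv, s);
        svst1(pg, &x[i], xr);
        svst1(pg, &y[i], yr);
    }
}
```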
+*******************************************************************************/
+#include "common.h"
+#include <arm_sve.h>
+
+#ifdef DOUBLE
+#define SVE_TYPE svfloat64_t
+#define SVE_ZERO svdup_f64(0.0)
+#define SVE_WHILELT svwhilelt_b64
+#define SVE_ALL svptrue_b64()
+#define SVE_WIDTH svcntd()
+#else
+#define SVE_TYPE svfloat32_t
+#define SVE_ZERO svdup_f32(0.0)
+#define SVE_WHILELT svwhilelt_b32
+#define SVE_ALL svptrue_b32()
+#define SVE_WIDTH svcntw()
+#endif
+
+static int rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
+{
+    for (BLASLONG i = 0; i < n; i += SVE_WIDTH)
+    {
+        svbool_t pg = SVE_WHILELT((uint64_t)i, (uint64_t)n);
+        SVE_TYPE x_vec = svld1(pg, &x[i]);
+        SVE_TYPE y_vec = svld1(pg, &y[i]);
+        SVE_TYPE cx_vec = svmul_z(pg, x_vec, c);
+        SVE_TYPE sy_vec = svmul_z(pg, y_vec, s);
+        SVE_TYPE sx_vec = svmul_z(pg, x_vec, s);
+        SVE_TYPE cy_vec = svmul_z(pg, y_vec, c);
+        svst1(pg, &x[i], svadd_z(pg, cx_vec, sy_vec));
+        svst1(pg, &y[i], svsub_z(pg, cy_vec, sx_vec));
+    }
+    return (0);
+}
\ No newline at end of file
diff --git a/kernel/arm64/swap.c b/kernel/arm64/swap.c
new file mode 100644
index 0000000000..6a9117cf0e
--- /dev/null
+++ b/kernel/arm64/swap.c
@@ -0,0 +1,40 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/ +#include "common.h" +#include "swap_kernel_sve.c" +#include "swap_kernel_c.c" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + if (n <= 0) + return 0; + if (inc_x == 1 && inc_y == 1) + swap_kernel_sve(n, x, y); + else + swap_kernel_c(n, x, inc_x, y, inc_y); + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/swap_kernel_c.c b/kernel/arm64/swap_kernel_c.c new file mode 100644 index 0000000000..4029350962 --- /dev/null +++ b/kernel/arm64/swap_kernel_c.c @@ -0,0 +1,46 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include "common.h" +#include + +static int swap_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; + + while (i < n) + { + temp = x[ix]; + x[ix] = y[iy]; + y[iy] = temp; + ix += inc_x; + iy += inc_y; + i++; + } + return (0); +} \ No newline at end of file diff --git a/kernel/arm64/swap_kernel_sve.c b/kernel/arm64/swap_kernel_sve.c new file mode 100644 index 0000000000..db3c0fae57 --- /dev/null +++ b/kernel/arm64/swap_kernel_sve.c @@ -0,0 +1,62 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#include "common.h"
+#include <arm_sve.h>
+
+#ifdef DOUBLE
+#define SVE_TYPE svfloat64_t
+#define SVE_ZERO svdup_f64(0.0)
+#define SVE_WHILELT svwhilelt_b64
+#define SVE_ALL svptrue_b64()
+#define SVE_WIDTH svcntd()
+#else
+#define SVE_TYPE svfloat32_t
+#define SVE_ZERO svdup_f32(0.0)
+#define SVE_WHILELT svwhilelt_b32
+#define SVE_ALL svptrue_b32()
+#define SVE_WIDTH svcntw()
+#endif
+
+static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+    BLASLONG sve_width = SVE_WIDTH;
+
+    for (BLASLONG i = 0; i < n; i += sve_width * 2)
+    {
+        svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
+        svbool_t pg_b = SVE_WHILELT((i + sve_width), n);
+        SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
+        SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
+        SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
+        SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
+        svst1(pg_a, &x[i], y_vec_a);
+        svst1(pg_a, &y[i], x_vec_a);
+        svst1(pg_b, &x[i + sve_width], y_vec_b);
+        svst1(pg_b, &y[i + sve_width], x_vec_b);
+    }
+    return (0);
+}
\ No newline at end of file

From 3368a4e697c45a5de4370b1e6861c9ab7178b297 Mon Sep 17 00:00:00 2001
From: SushilPratap04
Date: Fri, 13 Dec 2024 16:47:58 +0530
Subject: [PATCH 178/244] Update swap_kernel_sve.c

---
 kernel/arm64/swap_kernel_sve.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/arm64/swap_kernel_sve.c b/kernel/arm64/swap_kernel_sve.c
index db3c0fae57..1efdce48bd 100644
--- a/kernel/arm64/swap_kernel_sve.c
+++ b/kernel/arm64/swap_kernel_sve.c
@@ -48,7 +48,7 @@ static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y)
     for (BLASLONG i = 0; i < n; i += sve_width * 2)
     {
         svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
-        svbool_t pg_b = SVE_WHILELT((i + sve_width), n);
+        svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
         SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
         SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
         SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
@@ -59,4 +59,4 @@ static int swap_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y)
         svst1(pg_b, &x[i + sve_width], y_vec_b);
         svst1(pg_b, &y[i + sve_width], x_vec_b);
     }
     return (0);
-}
\ No newline at end of file
+}

From d00cc400b17155d6f5b624e272a3ec458f93a1fe Mon Sep 17 00:00:00 2001
From: "tingbo.liao"
Date: Wed, 18 Dec 2024 08:35:26 +0800
Subject: [PATCH 179/244] Replaced the __riscv_vid_v_i32m2 and __riscv_vid_v_i64m2 with __riscv_vid_v_u32m2 and __riscv_vid_v_u64m2 for riscv64-unknown-linux-gnu-gcc compiling.
Signed-off-by: tingbo.liao --- kernel/riscv64/symm_lcopy_rvv_v1.c | 9 +++++---- kernel/riscv64/symm_ucopy_rvv_v1.c | 10 ++++++---- kernel/riscv64/zhemm_ltcopy_rvv_v1.c | 9 +++++---- kernel/riscv64/zhemm_utcopy_rvv_v1.c | 10 ++++++---- kernel/riscv64/zsymm_lcopy_rvv_v1.c | 9 +++++---- kernel/riscv64/zsymm_ucopy_rvv_v1.c | 10 ++++++---- kernel/riscv64/ztrmm_lncopy_rvv_v1.c | 12 +++++++----- 7 files changed, 40 insertions(+), 29 deletions(-) diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c index a615db44d9..2e5bfc6caf 100644 --- a/kernel/riscv64/symm_lcopy_rvv_v1.c +++ b/kernel/riscv64/symm_lcopy_rvv_v1.c @@ -35,11 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -48,11 +49,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif // Optimizes the implementation in ../generic/symm_lcopy_4.c @@ -70,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); for (js = n; js > 0; js -= vl, posX += vl) { vl = VSETVL(js); @@ -98,4 +100,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON return 0; } - diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c index 464f97b3a6..faab88a678 100644 --- a/kernel/riscv64/symm_ucopy_rvv_v1.c +++ b/kernel/riscv64/symm_ucopy_rvv_v1.c @@ -35,11 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSEV_FLOAT __riscv_vse32_v_f32m2 #define VLSEV_FLOAT __riscv_vlse32_v_f32m2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -48,11 +49,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
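A note on the substance of this patch: the ratified RISC-V vector intrinsics expose `vid.v` (lane-index generation) only for unsigned element types, which is why newer GCC rejects the old `__riscv_vid_v_i32m2`/`__riscv_vid_v_i64m2` spellings. The fix generates the indices as unsigned and converts to the signed type the comparisons expect through a `vreinterpret`, wrapped here in the new `V_UM2_TO_IM2` macro. The pattern in isolation, assuming a toolchain that provides `<riscv_vector.h>` and the V extension:

```c
#include <riscv_vector.h>

/* Build the signed index vector {0, 1, 2, ...} of length vl: vid.v is
   exposed only for unsigned types, so generate unsigned and reinterpret. */
static vint32m2_t make_indices(size_t vl)
{
    vuint32m2_t uid = __riscv_vid_v_u32m2(vl);
    return __riscv_vreinterpret_v_u32m2_i32m2(uid);
}
```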
#define VSEV_FLOAT __riscv_vse64_v_f64m2 #define VLSEV_FLOAT __riscv_vlse64_v_f64m2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif // Optimizes the implementation in ../generic/symm_ucopy_4.c @@ -70,7 +72,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); for (js = n; js > 0; js -= vl, posX += vl) { vl = VSETVL(js); @@ -97,4 +99,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } return 0; -} +} \ No newline at end of file diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c index 97013895ae..15dfc229d8 100644 --- a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c @@ -41,7 +41,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 @@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -64,7 +65,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 @@ -73,6 +74,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif @@ -92,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); vzero = VFMVVF_FLOAT(ZERO, vl); for (js = n; js > 0; js -= vl, posX += vl) { @@ -136,4 +138,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON return 0; } - diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c index 59029e9e59..cc7c44e12c 100644 --- a/kernel/riscv64/zhemm_utcopy_rvv_v1.c +++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c @@ -41,7 +41,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 @@ -50,6 +50,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -64,7 +65,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VFRSUB_VF_FLOAT __riscv_vfrsub_vf_f64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 @@ -73,6 +74,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 #define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif @@ -90,7 +92,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); vzero = VFMVVF_FLOAT(ZERO, vl); for (js = n; js > 0; js -= vl, posX += vl) { @@ -132,4 +134,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } return 0; -} +} \ No newline at end of file diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c index f4d8061909..ed0e00b547 100644 --- a/kernel/riscv64/zsymm_lcopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c @@ -41,11 +41,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -60,11 +61,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b) @@ -81,7 +83,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON INT_V_T vindex_max, vindex; size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); for (js = n; js > 0; js -= vl, posX += vl) { vl = VSETVL(js); @@ -118,4 +120,3 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON return 0; } - diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c index 069551bb0e..5f3ac3d07d 100644 --- a/kernel/riscv64/zsymm_ucopy_rvv_v1.c +++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c @@ -41,11 +41,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define INT_V_T vint32m2_t -#define VID_V_INT __riscv_vid_v_i32m2 +#define VID_V_INT __riscv_vid_v_u32m2 #define VADD_VX_INT __riscv_vadd_vx_i32m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i32m2_b16 #define VBOOL_T vbool16_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define VSETVL_MAX __riscv_vsetvlmax_e64m2() @@ -60,11 +61,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m2x2 #define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m2x2 #define INT_V_T vint64m2_t -#define VID_V_INT __riscv_vid_v_i64m2 +#define VID_V_INT __riscv_vid_v_u64m2 #define VADD_VX_INT __riscv_vadd_vx_i64m2 #define VMSGT_VX_INT __riscv_vmsgt_vx_i64m2_b32 #define VBOOL_T vbool32_t #define VMERGE_VVM_FLOAT __riscv_vmerge_vvm_f64m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u64m2_i64m2 #endif @@ -83,7 +85,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON size_t vl = VSETVL_MAX; - vindex_max = VID_V_INT(vl); + vindex_max = V_UM2_TO_IM2(VID_V_INT(vl)); for (js = n; js > 0; js -= vl, posX += vl) { vl = VSETVL(js); @@ -118,4 +120,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } return 0; -} +} \ No newline at end of file diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c index ae664561b4..9264f13781 100644 --- a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c +++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c @@ -42,10 +42,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m2x2 #define VBOOL_T vbool16_t #define UINT_V_T vint32m2_t -#define VID_V_UINT __riscv_vid_v_i32m2 +#define VID_V_UINT __riscv_vid_v_u32m2 #define VMSGTU_VX_UINT __riscv_vmsgt_vx_i32m2_b16 #define VMSEQ_VX_UINT __riscv_vmseq_vx_i32m2_b16 #define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f32m2 +#define V_UM2_TO_IM2 __riscv_vreinterpret_v_u32m2_i32m2 #else #define VSETVL(n) __riscv_vsetvl_e64m2(n) #define FLOAT_V_T vfloat64m2_t @@ -63,6 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define VMSGTU_VX_UINT __riscv_vmsgtu_vx_u64m2_b32 #define VMSEQ_VX_UINT __riscv_vmseq_vx_u64m2_b32 #define VFMERGE_VFM_FLOAT __riscv_vfmerge_vfm_f64m2 +#define V_UM2_TO_IM2(values) values #endif int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ @@ -99,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } i = 0; - do + do { if (X > posY) { @@ -119,9 +121,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON X ++; i ++; } - else + else { - vindex = VID_V_UINT(vl); + vindex = V_UM2_TO_IM2(VID_V_UINT(vl)); for (unsigned int j = 0; j < vl; j++) { vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl); @@ -152,4 +154,4 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON } return 0; -} +} \ No newline at end of file From 48caf2303d4b953d74b3caba0f8fc4ad94c9cdd8 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 18 Dec 2024 08:53:29 +0100 Subject: [PATCH 180/244] Fix build warning about discarding volatile qualifier in memory.c The warning was: ``` [4339/5327] Building C object driver/others/CMakeFiles/driver_others.dir/memory.c.o /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c: In function 'blas_shutdown': /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c:3257:10: warning: passing argument 1 of 'free' discards 'volatile' qualifier from pointer target type [-Wdiscarded-qualifiers] 3257 | free(newmemory); | ^~~~~~~~~ In file included from /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/common.h:83, from /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c:74: /home/rgommers/code/pixi-dev-scipystack/openblas/.pixi/envs/default/x86_64-conda-linux-gnu/sysroot/usr/include/stdlib.h:482:25: note: expected 'void *' but argument is of type 'volatile struct newmemstruct *' 482 | extern void free (void *__ptr) __THROW; | ~~~~~~^~~~~ ``` The use of `volatile` for `newmemstruct` seems on purpose, and there are more such constructs in this file. The warning appeared after gh-4451 and is correct. The `free` prototype doesn't expect a volatile pointer, hence this change adds a cast to silence the warning. --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 6343a3785e..276e39ece0 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -3254,7 +3254,7 @@ void blas_shutdown(void){ #endif newmemory[pos].lock = 0; } - free(newmemory); + free((void*)newmemory); newmemory = NULL; memory_overflowed = 0; } From 765ad8bcd2bee89d8393a2200a6777989a8d4db0 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Wed, 18 Dec 2024 09:39:07 +0100 Subject: [PATCH 181/244] Fix guard around `alloc_hugetlb`, fixes compile warning The warning was: ``` /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c: At top level: /home/rgommers/code/pixi-dev-scipystack/openblas/OpenBLAS/driver/others/memory.c:2565:14: warning: 'alloc_hugetlb' defined but not used [-Wunused-function] 2565 | static void *alloc_hugetlb(void *address){ | ^~~~~~~~~~~~~ ``` The added define is the same as is already present in the TLS part of `memory.c`. This follows up on gh-4681. 
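Regarding the volatile fix in PATCH 180 above: `free` is declared as taking a plain `void *`, so passing a pointer to a volatile-qualified object implicitly discards the qualifier and triggers `-Wdiscarded-qualifiers`; an explicit cast documents that this is intentional. A minimal reproduction, with a hypothetical struct standing in for `newmemstruct`:

```c
#include <stdlib.h>

struct node { int v; };   /* stand-in for the volatile-qualified table */

int main(void)
{
    volatile struct node *p = malloc(sizeof *p);
    /* free(p);  -- gcc: "discards 'volatile' qualifier from pointer target" */
    free((void *)p);       /* explicit cast: qualifier dropped on purpose */
    return 0;
}
```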
--- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 276e39ece0..c53e798bc1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2538,7 +2538,7 @@ static void *alloc_shm(void *address){ } #endif -#if defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS +#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) static void alloc_hugetlb_free(struct release_t *release){ From e460512685b3004c3796b4620c1454150cf61ef0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Dec 2024 00:50:37 +0100 Subject: [PATCH 182/244] Update WoA build instructions from rewording in issue #5001 --- docs/install.md | 66 +++++++++++++++++++++++++++++++------------------ 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/docs/install.md b/docs/install.md index b842d3355b..7155263056 100644 --- a/docs/install.md +++ b/docs/install.md @@ -437,36 +437,54 @@ To then use the built OpenBLAS shared library in Visual Studio: [Qt Creator](http://qt.nokia.com/products/developer-tools/). -#### Windows on Arm - -While OpenBLAS can be built with Microsoft VisualStudio (Community Edition or commercial), you would only be able to build for the GENERIC target -that does not use optimized assembly kernels, also the stock VisualStudio lacks the Fortran compiler necessary for building the LAPACK component. -It is therefore highly recommended to download the free LLVM compiler suite and use it to compile OpenBLAS outside of VisualStudio. - -The following tools needs to be installed to build for Windows on Arm (WoA): - -- LLVM for Windows on Arm. - Find the latest LLVM build for WoA from [LLVM release page](https://releases.llvm.org/) - you want the package whose name ends in "woa64.exe". - (This may not always be present in the very latest point release, as building and uploading the binaries takes time.) - E.g: a LLVM 19 build for WoA64 can be found [here](https://github.com/llvm/llvm-project/releases/download/llvmorg-19.1.2/LLVM-19.1.2-woa64.exe). - Run the LLVM installer and ensure that LLVM is added to the environment variable PATH. (If you do not want to add it to the PATH, you will need to specify - both C and Fortran compiler to Make or CMake with their full path later on) +## Windows on Arm + +A fully functional native OpenBLAS for WoA that can be built as both a static and dynamic library using LLVM toolchain and Visual Studio 2022. Before starting to build, make sure that you have installed Visual Studio 2022 on your ARM device, including the "Desktop Development with C++" component (that contains the cmake tool). +(Note that you can use the free "Visual Studio 2022 Community Edition" for this task. In principle it would be possible to build with VisualStudio alone, but using +the LLVM toolchain enables native compilation of the Fortran sources of LAPACK and of all the optimized assembly files, which VisualStudio cannot handle on its own) + + 1. Clone OpenBLAS to your local machine and checkout to latest release of OpenBLAS (unless you want to build the latest development snapshot - here we are using the 0.3.28 release as the example, of course this exact version may be outdated by the time you read this) + + ```cmd + git clone https://github.com/OpenMathLib/OpenBLAS.git + cd OpenBLAS + git checkout v0.3.28 + ``` + + 2. 
Install Latest LLVM toolchain for WoA:
+
+   Download the Latest LLVM toolchain for WoA from [the Release page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At the time of writing, this is version 19.1.5 - be sure to select the latest release for which you can find a precompiled package whose name ends in "-woa64.exe" (precompiled packages
+   usually lag a week or two behind their corresponding source release).
+   Make sure to enable the option “Add LLVM to the system PATH for all the users”
+   Note: Make sure that the path of LLVM toolchain is at the top of Environment Variables section to avoid conflicts between the set of compilers available in the system path
+
+ 3. Launch the Native Command Prompt for Windows ARM64:
+
+   From the start menu search for “ARM64 Native Tools Command Prompt for Visual Studio 2022”
+   Alternatively open command prompt, run the following command to activate the environment:
+   "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat"
+
+   Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja:
+
+   ```cmd
+   cd OpenBLAS
+   mkdir build
+   cd build
+
+   cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DTARGET=ARMV8 -DBINARY=64 -DCMAKE_C_COMPILER=clang-cl -DCMAKE_C_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_ASM_COMPILER_TARGET=arm64-pc-windows-msvc -DCMAKE_Fortran_COMPILER=flang-new
-The following steps describe how to build the static library for OpenBLAS with either Make or CMake:
+   ninja -j16
+   ```
+
+Note: You might want to include additional options in the cmake command here. For example, the default configuration only generates a static .lib version of the library. If you prefer a DLL, you can add -DBUILD_SHARED_LIBS=ON.
-1. Build OpenBLAS with Make:
+Note that it is also possible to use the same setup to build OpenBLAS with Make, if you prefer Makefiles over the CMake build for some reason:
-   ```bash
+   ```cmd
    $ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make
    ```
-2. Build OpenBLAS with CMake
-   ```bash
-   $ mkdir build
-   $ cd build
-   $ cmake .. -G Ninja -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang-new -DTARGET=ARMV8 -DCMAKE_BUILD_TYPE=Release
-   $ cmake --build .
-   ```
+
 #### Generating an import library

From a93d3db34a7e2fe70bbeb3a43c20323d85802a74 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Thu, 19 Dec 2024 00:53:10 +0100
Subject: [PATCH 183/244] fix formatting of WoA section

---
 docs/install.md | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/install.md b/docs/install.md
index 7155263056..5bb88cccd8 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -437,13 +437,13 @@ To then use the built OpenBLAS shared library in Visual Studio:
 [Qt Creator](http://qt.nokia.com/products/developer-tools/).

-## Windows on Arm
+### Windows on Arm

 A fully functional native OpenBLAS for WoA that can be built as both a static and dynamic library using LLVM toolchain and Visual Studio 2022. Before starting to build, make sure that you have installed Visual Studio 2022 on your ARM device, including the "Desktop Development with C++" component (that contains the cmake tool).
 (Note that you can use the free "Visual Studio 2022 Community Edition" for this task.
In principle it would be possible to build with VisualStudio alone, but using the LLVM toolchain enables native compilation of the Fortran sources of LAPACK and of all the optimized assembly files, which VisualStudio cannot handle on its own) - 1. Clone OpenBLAS to your local machine and checkout to latest release of OpenBLAS (unless you want to build the latest development snapshot - here we are using the 0.3.28 release as the example, of course this exact version may be outdated by the time you read this) +1. Clone OpenBLAS to your local machine and checkout to latest release of OpenBLAS (unless you want to build the latest development snapshot - here we are using the 0.3.28 release as the example, of course this exact version may be outdated by the time you read this) ```cmd git clone https://github.com/OpenMathLib/OpenBLAS.git @@ -451,20 +451,20 @@ the LLVM toolchain enables native compilation of the Fortran sources of LAPACK a git checkout v0.3.28 ``` - 2. Install Latest LLVM toolchain for WoA: +2. Install Latest LLVM toolchain for WoA: - Download the Latest LLVM toolchain for WoA from [the Release page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At the time of writing, this is version 19.1.5 - be sure to select the latest release for which you can find a precompiled package whose name ends in "-woa64.exe" (precompiled packages - usually lag a week or two behind their corresponding source release). - Make sure to enable the option “Add LLVM to the system PATH for all the users” - Note: Make sure that the path of LLVM toolchain is at the top of Environment Variables section to avoid conflicts between the set of compilers available in the system path +Download the Latest LLVM toolchain for WoA from [the Release page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At the time of writing, this is version 19.1.5 - be sure to select the latest release for which you can find a precompiled package whose name ends in "-woa64.exe" (precompiled packages +usually lag a week or two behind their corresponding source release). +Make sure to enable the option “Add LLVM to the system PATH for all the users” +Note: Make sure that the path of LLVM toolchain is at the top of Environment Variables section to avoid conflicts between the set of compilers available in the system path - 3. Launch the Native Command Prompt for Windows ARM64: +3. 
Launch the Native Command Prompt for Windows ARM64: - From the start menu search for “ARM64 Native Tools Command Prompt for Visual Studio 2022” - Alternatively open command prompt, run the following command to activate the environment: - "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat" +From the start menu search for “ARM64 Native Tools Command Prompt for Visual Studio 2022” +Alternatively open command prompt, run the following command to activate the environment: +"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat" - Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja: +Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja: ```cmd cd OpenBLAS From 1c4401ebf16dd4ff3c0de8a7517bea9724a63a45 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Dec 2024 14:32:24 -0800 Subject: [PATCH 184/244] Add target-specific options to enable SVE with the NVIDIA compiler --- Makefile.arm64 | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index fccc0d0d0f..2909a83e0e 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -351,4 +351,31 @@ endif endif +else +# NVIDIA HPC options necessary to enable SVE in the compiler +ifeq ($(CORE), THUNDERX2T99) +CCOMMON_OPT += -tp=thunderx2t99 +FCOMMON_OPT += -tp=thunderx2t99 +endif +ifeq ($(CORE), NEOVERSEN1) +CCOMMON_OPT += -tp=neoverse-n1 +FCOMMON_OPT += -tp=neoverse-n1 +endif +ifeq ($(CORE), NEOVERSEV1) +CCOMMON_OPT += -tp=neoverse-v1 +FCOMMON_OPT += -tp=neoverse-v1 +endif +ifeq ($(CORE), NEOVERSEV2) +CCOMMON_OPT += -tp=neoverse-v2 +FCOMMON_OPT += -tp=neoverse-v2 +endif +ifeq ($(CORE), ARMV8SVE) +CCOMMON_OPT += -tp=neoverse-v2 +FCOMMON_OPT += -tp=neoverse-v2 +endif +ifeq ($(CORE), ARMV9SVE) +CCOMMON_OPT += -tp=neoverse-v2 +FCOMMON_OPT += -tp=neoverse-v2 +endif + endif From 32319a33ac5e7c1562ce9763cae0a5118a8ec2bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:00:48 +0100 Subject: [PATCH 185/244] Add options for Intel oneAPI 2025.0 ifx on Windows --- cmake/f_check.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 4c4f5ac044..dc0f5e0ac5 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -45,13 +45,15 @@ if (NOT ONLY_CBLAS) # TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile # TODO: set FEXTRALIB flags a la f_check? 
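Background for the ifx-on-Windows patches in this stretch of the series: traditional Unix Fortran compilers decorate external symbols with a trailing underscore (`daxpy` becomes `daxpy_`), and the `BUNDERSCORE`/`NOCHANGE` machinery picks the matching C-side spelling; the `/assume:nounderscore` flag added here makes ifx emit bare names, so the C side must be told not to append one. Schematically, with macro names invented for this sketch rather than taken from OpenBLAS's headers:

```c
/* Choose the Fortran symbol spelling at compile time. */
#ifdef NO_TRAILING_UNDERSCORE        /* e.g. ifx on Windows, per this series */
#define F77_NAME(lc) lc
#else                                /* gfortran and most Unix compilers */
#define F77_NAME(lc) lc##_
#endif

/* One C prototype then matches either mangling. */
extern void F77_NAME(daxpy)(const int *n, const double *alpha,
                            const double *x, const int *incx,
                            double *y, const int *incy);
```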
- + if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND ${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM")) set(BU "_") file(APPEND ${TARGET_CONF_TEMP} "#define BUNDERSCORE _\n" "#define NEEDBUNDERSCORE 1\n" "#define NEED2UNDERSCORES 0\n") - + else () + set (FCOMMON_OPT "${FCOMMON_OPT} /fp:precise /recursive /names:lowercase /assume:nounderscore") + endif() else () #When we only build CBLAS, we set NOFORTRAN=2 From 30188a55d180a493922dc9ffc4ff0c17696cdf41 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:02:34 +0100 Subject: [PATCH 186/244] Don't assume underlined symbols for ifx; make cpuid.S inclusion conditional --- cmake/prebuild.cmake | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 53a78d782f..bdc0f7f927 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -58,7 +58,7 @@ set(TARGET_CONF_TEMP "${PROJECT_BINARY_DIR}/${TARGET_CONF}.tmp") # c_check set(FU "") -if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) +if (APPLE OR (MSVC AND NOT (${CMAKE_C_COMPILER_ID} MATCHES "Clang" OR ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM"))) set(FU "_") endif() if(MINGW AND NOT MINGW64) @@ -1433,7 +1433,9 @@ else(NOT CMAKE_CROSSCOMPILING) message(STATUS "MSVC") set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC) else() - list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") + list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S) + endif() if (DEFINED TARGET_CORE) set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE}) endif () From d78fbe425c4ea0a79f005d8c6b1014b4b16743b2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:04:50 +0100 Subject: [PATCH 187/244] Assume no underline suffixes on symbols when compiling with ifx on Windows --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4ac244e3ea..7413c88c80 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -672,6 +672,9 @@ endif () if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") endif () +if (${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") + set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DNOCHANGE") +endif () if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") if ("${F_COMPILER}" STREQUAL "FLANG") From 5d81e514e4d289879921ff3be9b432afdc5fc53f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:06:03 +0100 Subject: [PATCH 188/244] Assume no underline suffixes on symbols when compiling with ifx on Windows --- ctest/cblas_test.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ctest/cblas_test.h b/ctest/cblas_test.h index 24ea677637..502a2fee20 100644 --- a/ctest/cblas_test.h +++ b/ctest/cblas_test.h @@ -10,6 +10,10 @@ #define int long #endif +#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) +//#define LAPACK_COMPLEX_STRUCTURE +#define NOCHANGE +#endif /* e.g. 
mingw64/x86_64-w64-mingw32/include/winerror.h */ #ifdef FAILED #undef FAILED From 5c9417d3061650a26062f3759da4f8586fa790f0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:07:39 +0100 Subject: [PATCH 189/244] Assume no underline suffixes on symbols when compiling with ifx on Windows --- lapack-netlib/LAPACKE/include/lapacke_config.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h index 798a5eb2ef..4ef542fb10 100644 --- a/lapack-netlib/LAPACKE/include/lapacke_config.h +++ b/lapack-netlib/LAPACKE/include/lapacke_config.h @@ -67,8 +67,14 @@ extern "C" { #define lapack_logical lapack_int #endif +#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) +#define LAPACK_COMPLEX_STRUCTURE +#define LAPACK_GLOBAL(lcname,UCNAME) lcname +#define NOCHANGE +#endif + #ifndef LAPACK_COMPLEX_CUSTOM -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER) #define _CRT_USE_C_COMPLEX_H #include #define LAPACK_COMPLEX_CUSTOM From 64c6c7920175b6b1603a3a876b86c83e4a4a3cdf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:09:34 +0100 Subject: [PATCH 190/244] Assume no underline suffixes on symbols when compiling with Intel ifx on Windows --- utest/openblas_utest.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utest/openblas_utest.h b/utest/openblas_utest.h index abe381a924..1851c60c56 100644 --- a/utest/openblas_utest.h +++ b/utest/openblas_utest.h @@ -36,7 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "ctest.h" - +#if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER) +//#define LAPACK_COMPLEX_STRUCTURE +#define NOCHANGE +#endif #include #include From 05fe49ddafc438b564879e5fc19b6ab8083a2e3e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 19:12:17 +0100 Subject: [PATCH 191/244] Rename local copy functions to avoid name clash with the standard BLAS ones --- utest/test_extensions/common.c | 8 ++++---- utest/test_extensions/common.h | 10 +++++----- utest/test_extensions/test_cimatcopy.c | 2 +- utest/test_extensions/test_comatcopy.c | 2 +- utest/test_extensions/test_dimatcopy.c | 2 +- utest/test_extensions/test_domatcopy.c | 2 +- utest/test_extensions/test_simatcopy.c | 2 +- utest/test_extensions/test_somatcopy.c | 2 +- utest/test_extensions/test_zimatcopy.c | 2 +- utest/test_extensions/test_zomatcopy.c | 2 +- 10 files changed, 17 insertions(+), 17 deletions(-) diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c index 808aa54557..a5d3196aaf 100644 --- a/utest/test_extensions/common.c +++ b/utest/test_extensions/common.c @@ -206,7 +206,7 @@ void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int ld * param lda_dst - leading dimension of output matrix A * param conj specifies conjugation */ -void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, +void my_scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, float *a_dst, blasint lda_dst) { blasint i, j; @@ -217,7 +217,7 @@ void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, } } -void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, +void my_dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst) { blasint i, j; @@ -228,7 +228,7 @@ void dcopy(blasint rows, blasint cols, double alpha, 
double *a_src, int lda_src, } } -void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, +void my_ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, float *a_dst, blasint lda_dst, int conj) { blasint i, j; @@ -243,7 +243,7 @@ void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, } } -void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, +void my_zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst, int conj) { blasint i, j; diff --git a/utest/test_extensions/common.h b/utest/test_extensions/common.h index 62b84325c2..f8c60d2686 100644 --- a/utest/test_extensions/common.h +++ b/utest/test_extensions/common.h @@ -65,12 +65,12 @@ extern void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, i extern void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst, int conj); -extern void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, +extern void my_scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, float *a_dst, blasint lda_dst); -extern void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, +extern void my_dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst); -extern void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, +extern void my_ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, float *a_dst, blasint lda_dst, int conj); -extern void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, +extern void my_zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, double *a_dst, blasint lda_dst, int conj); -#endif \ No newline at end of file +#endif diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c index 0c96a3b17c..41c0a0f6b0 100644 --- a/utest/test_extensions/test_cimatcopy.c +++ b/utest/test_extensions/test_cimatcopy.c @@ -91,7 +91,7 @@ static float check_cimatcopy(char api, char order, char trans, blasint rows, bla ctranspose(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); } else { - ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); + my_ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj); } if (api == 'F') { diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c index b493c93a6f..dc6beeeaee 100644 --- a/utest/test_extensions/test_comatcopy.c +++ b/utest/test_extensions/test_comatcopy.c @@ -92,7 +92,7 @@ static float check_comatcopy(char api, char order, char trans, blasint rows, bla ctranspose(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); } else { - ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); + my_ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj); } if (api == 'F') { diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c index eebb7669eb..f57707eeea 100644 --- a/utest/test_extensions/test_dimatcopy.c +++ b/utest/test_extensions/test_dimatcopy.c @@ -86,7 +86,7 @@ static double check_dimatcopy(char api, char order, char trans, blasint rows, bl dtranspose(m, n, alpha, data_dimatcopy.a_test, lda_src, 
data_dimatcopy.a_verify, lda_dst); } else { - dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); + my_dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst); } if (api == 'F') { diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c index e892271d2d..8869f7b453 100644 --- a/utest/test_extensions/test_domatcopy.c +++ b/utest/test_extensions/test_domatcopy.c @@ -87,7 +87,7 @@ static double check_domatcopy(char api, char order, char trans, blasint rows, bl dtranspose(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); } else { - dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); + my_dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb); } if (api == 'F') { diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c index c00ea0c8f0..6b70881bf9 100644 --- a/utest/test_extensions/test_simatcopy.c +++ b/utest/test_extensions/test_simatcopy.c @@ -86,7 +86,7 @@ static float check_simatcopy(char api, char order, char trans, blasint rows, bla stranspose(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); } else { - scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); + my_scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst); } if (api == 'F') { diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c index 62a6056d92..bcc2eabf51 100644 --- a/utest/test_extensions/test_somatcopy.c +++ b/utest/test_extensions/test_somatcopy.c @@ -87,7 +87,7 @@ static float check_somatcopy(char api, char order, char trans, blasint rows, bla stranspose(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); } else { - scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); + my_scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb); } if (api == 'F') { diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c index 86bc4670f2..349050b9c1 100644 --- a/utest/test_extensions/test_zimatcopy.c +++ b/utest/test_extensions/test_zimatcopy.c @@ -91,7 +91,7 @@ static double check_zimatcopy(char api, char order, char trans, blasint rows, bl ztranspose(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); } else { - zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); + my_zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj); } if (api == 'F') { diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c index 208cfd981c..eb13d10830 100644 --- a/utest/test_extensions/test_zomatcopy.c +++ b/utest/test_extensions/test_zomatcopy.c @@ -92,7 +92,7 @@ static double check_zomatcopy(char api, char order, char trans, blasint rows, bl ztranspose(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); } else { - zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); + my_zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj); } if (api == 'F') { From e6fd62977056b95aad33a10c433ce686e4f852e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 23 Dec 2024 23:18:52 +0100 Subject: [PATCH 192/244] Expressly declare the .S extension for assembly (documented as standard, but current cmake does not set it 
for icx) --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddff73c2cd..3c6508edff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ cmake_minimum_required(VERSION 3.16.0) +set (CMAKE_ASM_SOURCE_FILE_EXTENSIONS "S") project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) From 0bea1cfd9df3eeeefa3c19d88b6e8d08b15d9603 Mon Sep 17 00:00:00 2001 From: "tingbo.liao" Date: Tue, 24 Dec 2024 10:33:27 +0800 Subject: [PATCH 193/244] Optimize the zgemm_tcopy_4_rvv function to be compatible with the situations where the vector lengths(vlens) are 128 and 256. Signed-off-by: tingbo.liao --- kernel/riscv64/zgemm_tcopy_4_rvv.c | 136 ++++++----------------------- 1 file changed, 25 insertions(+), 111 deletions(-) diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c index cfafbf0dc7..9c194877a2 100644 --- a/kernel/riscv64/zgemm_tcopy_4_rvv.c +++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c @@ -28,35 +28,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if !defined(DOUBLE) -#define VSETVL(n) __riscv_vsetvl_e32m1(n) -#define FLOAT_V_T vfloat32m1_t -#define FLOAT_VX2_T vfloat32m1x2_t -#define FLOAT_VX4_T vfloat32m1x4_t -#define FLOAT_VX8_T vfloat32m1x8_t -#define VLEV_FLOAT __riscv_vle32_v_f32m1 -#define VSEV_FLOAT __riscv_vse32_v_f32m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e32_v_f32m1x2 -#define VLSSEG4_FLOAT __riscv_vlsseg4e32_v_f32m1x4 -#define VLSSEG8_FLOAT __riscv_vlsseg8e32_v_f32m1x8 -#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m1x2 -#define VSSEG4_FLOAT __riscv_vsseg4e32_v_f32m1x4 -#define VSSEG8_FLOAT __riscv_vsseg8e32_v_f32m1x8 +#define FLOAT_V_T vfloat32m2_t +#define FLOAT_V_T_HALF vfloat32m1_t +#define VLEV_FLOAT __riscv_vle32_v_f32m2 +#define VLEV_FLOAT_HALF __riscv_vle32_v_f32m1 +#define VSEV_FLOAT __riscv_vse32_v_f32m2 +#define VSEV_FLOAT_HALF __riscv_vse32_v_f32m1 #else -#define VSETVL(n) __riscv_vsetvl_e64m1(n) -#define FLOAT_V_T vfloat64m1_t -#define FLOAT_VX2_T vfloat64m1x2_t -#define FLOAT_VX4_T vfloat64m1x4_t -#define FLOAT_VX8_T vfloat64m1x8_t -#define VLEV_FLOAT __riscv_vle64_v_f64m1 -#define VSEV_FLOAT __riscv_vse64_v_f64m1 -#define VLSSEG2_FLOAT __riscv_vlsseg2e64_v_f64m1x2 -#define VLSSEG4_FLOAT __riscv_vlsseg4e64_v_f64m1x4 -#define VLSSEG8_FLOAT __riscv_vlsseg8e64_v_f64m1x8 -#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m1x2 -#define VSSEG4_FLOAT __riscv_vsseg4e64_v_f64m1x4 -#define VSSEG8_FLOAT __riscv_vsseg8e64_v_f64m1x8 +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_HALF vfloat64m2_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLEV_FLOAT_HALF __riscv_vle64_v_f64m2 +#define VSEV_FLOAT __riscv_vse64_v_f64m4 +#define VSEV_FLOAT_HALF __riscv_vse64_v_f64m2 #endif + int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ BLASLONG i, j; @@ -67,9 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ IFLOAT *boffset, *boffset1, *boffset2, *boffset3; FLOAT_V_T v0; - FLOAT_VX2_T vx2; - FLOAT_VX4_T vx4; - FLOAT_VX8_T vx8; + FLOAT_V_T_HALF v1; size_t vl; @@ -80,86 +65,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ boffset2 = b + 2 * m * (n & ~3); boffset3 = b + 2 * m * (n & ~1); - for(j = (m >> 2); j > 0; j--) { - - aoffset1 = aoffset; - aoffset += 8 * lda; - - boffset1 = boffset; - boffset += 32; - - for(i = (n >> 2); i > 0; i--) { - vl = 4; - - vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, vx8, vl); - - aoffset1 += 8; - boffset1 += m * 8; - } 
- - if (n & 2) { - vl = 4; - - vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, vx4, vl); - - aoffset1 += 4; - boffset2 += 16; - } - - if (n & 1) { - vl = 4; - - vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, vx2, vl); - - aoffset1 += 2; - boffset3 += 8; - } - } - - if (m & 2) { + for(j = m; j > 0; j--) { aoffset1 = aoffset; - aoffset += 4 * lda; - boffset1 = boffset; - boffset += 16; - - for(i = (n >> 2); i > 0; i--) { - vl = 2; - - vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG8_FLOAT(boffset1, vx8, vl); - - aoffset1 += 8; - boffset1 += m * 8; - } - - if (n & 2) { - vl = 2; - - vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG4_FLOAT(boffset2, vx4, vl); - - aoffset1 += 4; - boffset2 += 8; - } - - if (n & 1) { - vl = 2; - vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl); - VSSEG2_FLOAT(boffset3, vx2, vl); - - //aoffset1 += 2; - boffset3 += 4; - } - } - - if (m & 1) { - aoffset1 = aoffset; - boffset1 = boffset; + aoffset += 2 * lda; + boffset += 8; for(i = (n >> 2); i > 0; i--) { vl = 8; @@ -174,16 +85,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ if (n & 2) { vl = 4; - v0 = VLEV_FLOAT(aoffset1, vl); - VSEV_FLOAT(boffset2, v0, vl); + v1 = VLEV_FLOAT_HALF(aoffset1, vl); + VSEV_FLOAT_HALF(boffset2, v1, vl); aoffset1 += 4; - //boffset2 += 4; + boffset2 += 4; } if (n & 1) { - *(boffset3) = *(aoffset1); - *(boffset3 + 1) = *(aoffset1 + 1); + *(boffset3) = *(aoffset1); + *(boffset3 + 1) = *(aoffset1 + 1); + + aoffset1 += 2; + boffset3 += 2; } } From fbf594b62f4d1ee015a03a5df6e58fe796e63c98 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 24 Dec 2024 13:34:33 +0100 Subject: [PATCH 194/244] Guard against empty CMAKE_Fortran_COMPILER_ID --- cmake/f_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index dc0f5e0ac5..3f713807ea 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -45,7 +45,7 @@ if (NOT ONLY_CBLAS) # TODO: detect whether underscore needed, set #defines and BU appropriately - use try_compile # TODO: set FEXTRALIB flags a la f_check? - if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND ${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM")) + if (NOT (${CMAKE_SYSTEM_NAME} MATCHES "Windows" AND x${CMAKE_Fortran_COMPILER_ID} MATCHES "IntelLLVM")) set(BU "_") file(APPEND ${TARGET_CONF_TEMP} "#define BUNDERSCORE _\n" From 762fa1afa9aedebb32e4516b9b5b35a70869dd0e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 24 Dec 2024 19:48:04 +0100 Subject: [PATCH 195/244] fix link to faq --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d8e73b2022..cc9325d39d 100644 --- a/README.md +++ b/README.md @@ -338,7 +338,7 @@ Please see Changelog.txt. ## Troubleshooting -* Please read the [FAQ](www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first. +* Please read the [FAQ](http://www.openmathlib.org/OpenBLAS/docs/faq) section of the docs first. * Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD. * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. Clang 3.0 will generate the wrong AVX binary code. 
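The troubleshooting bullets above give minimum compiler versions without showing how to verify them. A minimal sketch, assuming `gcc` and `clang` are on the `PATH` (version-banner formats differ across distributions):

```sh
# Check the toolchain minimums named above before building:
#   GCC   >= 4.6 for the Sandy Bridge AVX kernels,
#   Clang >= 3.1 on Sandy Bridge (Clang 3.0 emits wrong AVX code).
gcc --version | head -n1
clang --version | head -n1
```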
From df42f79c4c7bc94e5b861af129638229ec2c0ce9 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Thu, 26 Dec 2024 21:09:26 +0100 Subject: [PATCH 196/244] docs: update extensions and install pages with last wiki edits I went through the wiki pages and found two pages with edits that weren't reflected in the html docs yet, so syncing that content here. --- docs/extensions.md | 16 ++++++++-------- docs/install.md | 14 ++++++++++---- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/docs/extensions.md b/docs/extensions.md index 483b009289..bc015910d3 100644 --- a/docs/extensions.md +++ b/docs/extensions.md @@ -5,14 +5,14 @@ This page documents those non-standard APIs. ## BLAS-like extensions -| Routine | Data Types | Description | -| ------------- |:------------- | :---------------| -| ?axpby | s,d,c,z | like axpy with a multiplier for y | -| ?gemm3m | c,z | gemm3m | -| ?imatcopy | s,d,c,z | in-place transpositon/copying | -| ?omatcopy | s,d,c,z | out-of-place transpositon/copying | -| ?geadd | s,d,c,z | matrix add | -| ?gemmt | s,d,c,z | gemm but only a triangular part updated| +| Routine | Data Types | Description | +| ------------- |:------------- | :-----------------------------------------------| +| ?axpby | s,d,c,z | like `axpy` with a multiplier for `y` | +| ?gemm3m | c,z | `gemm3m` | +| ?imatcopy | s,d,c,z | in-place transposition/copying | +| ?omatcopy | s,d,c,z | out-of-place transposition/copying | +| ?geadd | s,d,c,z | ATLAS-like matrix add `B = α*A+β*B` | +| ?gemmt | s,d,c,z | `gemm` but only a triangular part updated | ## bfloat16 functionality diff --git a/docs/install.md b/docs/install.md index 5bb88cccd8..3bc7ffc8f9 100644 --- a/docs/install.md +++ b/docs/install.md @@ -536,7 +536,6 @@ In your shell, move to this directory: `cd exports`. To build OpenBLAS for Android, you will need the following tools installed on your machine: - [The Android NDK](https://developer.android.com/ndk/) -- Perl - Clang compiler on the build machine The next two sections below describe how to build with Clang for ARMV7 and @@ -578,7 +577,9 @@ utility in the make command above, like so: AR=${NDK_BUNDLE_DIR}/toolchains/arm-linux-androideabi-4.9/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-gcc-ar ``` otherwise you may get a linker error complaining like `malformed archive header -name at 8` when the native macOS `ar` command was invoked instead. +name at 8` when the native macOS `ar` command was invoked instead. Note that +with recent NDK versions, the AR tool may be named `llvm-ar` rather than what +is assumed above. #### Building for ARMV8 @@ -608,12 +609,17 @@ Note: for NDK 23b, something as simple as: export PATH=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/:$PATH make HOSTCC=gcc CC=/opt/android-ndk-r23b/toolchains/llvm/prebuilt/linux-x86_64/bin/aarch64-linux-android31-clang ONLY_CBLAS=1 TARGET=ARMV8 ``` -appears to be sufficient on Linux. +appears to be sufficient on Linux. On OSX, setting AR to the ar provided in the +"bin" path of the NDK (probably `llvm-ar`) is also necessary. ??? note "Alternative build script for 3 architectures" - This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`, `X86`) and install them to `/opt/OpenBLAS/lib`. + This script will build OpenBLAS for 3 architecture (`ARMV7`, `ARMV8`, + `X86`) and install them to `/opt/OpenBLAS/lib`. 
Of course you can also copy + only the section that is of interest to you - also notice that the `AR=` + line may need adapting to the name of the ar tool provided in your + `$TOOLCHAIN/bin` - for example `llvm-ar` in some recent NDK versions. It was tested on macOS with NDK version 21.3.6528147. ```bash From d5e255519e5cbdd496a816dce939ae54f59896f2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Dec 2024 22:38:23 +0100 Subject: [PATCH 197/244] Improve OpenBLASConfig.cmake contents --- Makefile.install | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Makefile.install b/Makefile.install index 129ed9a137..bfed157a49 100644 --- a/Makefile.install +++ b/Makefile.install @@ -191,22 +191,29 @@ endif #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" - @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_INCLUDE_DIRS \$${_OpenBLAS_ROOT_DIR}/include)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/bin/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), Darwin) - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" +endif + @echo "add_library(OpenBLAS::OpenBLAS SHARED IMPORTED)" + @echo "target_include_directories(OpenBLAS::OpenBLAS INTERFACE \$${OpenBLAS_INCLUDE_DIRS})" +ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) + @echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_LOCATION \$${OpenBLAS_LIBRARIES})" + @echo "set_property(TARGET OpenBLAS::OpenBLAS PROPERTY IMPORTED_IMPLIB \$${_OpenBLAS_ROOT_DIR}/lib/libopenblas.lib)" endif else #only static - @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif #Generating OpenBLASConfigVersion.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) From fff2e214caee6e516ba1e49de81e9044d46b5a2e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Dec 2024 23:05:17 +0100 Subject: [PATCH 198/244] Add LAPACK-TEST errors topic --- docs/faq.md | 13 ++++++++++--- 
1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 699042d512..1a3505ca90 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -51,9 +51,9 @@ In practice, the values are derived by experimentation to yield the block sizes ### How can I report a bug? -Please file an issue at this [issue page](https://github.com/xianyi/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users). +Please file an issue at this [issue page](https://github.com/OpenMathLib/OpenBLAS/issues) or send mail to the [OpenBLAS mailing list](https://groups.google.com/forum/#!forum/openblas-users). -Please provide the following information: CPU, OS, compiler, and OpenBLAS compiling flags (Makefile.rule). In addition, please describe how to reproduce this bug. +Please provide the following information: CPU, OS, compiler, OpenBLAS version and any compiling flags you used (Makefile.rule). In addition, please describe how to reproduce this bug. ### How to reference OpenBLAS. @@ -105,7 +105,7 @@ Please read [this page](install.md#visual-studio). Zaheer has fixed this bug. You can now use the structure instead of C99 complex numbers. Please read [this issue page](http://github.com/xianyi/OpenBLAS/issues/95) for details. -[This issue](https://github.com/xianyi/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio. +[This issue](https://github.com/OpenMathLib/OpenBLAS/issues/305) is for using LAPACKE in Visual Studio. ### I get a SEGFAULT with multi-threading on Linux. What's wrong? @@ -134,6 +134,13 @@ Background: OpenBLAS implements optimized versions of some LAPACK functions, so Some of the LAPACK tests, notably in xeigtstz, try to allocate around 10MB on the stack. You may need to use `ulimit -s` to change the default limits on your system to allow this. +### My build worked fine and passed the BLAS tests, but running `make lapack-test` ends with a number of errors in the summary report + +The LAPACK tests were primarily created to test the validity of the Reference-LAPACK implementation, which is implemented in unoptimized, single-threaded Fortran code. This makes it very sensitive to small numerical deviations that can result from the use of specialized cpu instructions that combine multiplications and additions without intermediate rounding and storing to memory (FMA), or from changing the order of mathematical operations by splitting an original problem workload into smaller tasks that are solved in parallel. As a result, you may encounter a small number of errors in the "numerical" column of +the summary table at the end of the `make lapack-test` run - this is usually nothing to worry about, and the exact number and distribution of errors among the +four data types will often vary with the optimization flags you supplied to the compiler, or the cpu model for which you built OpenBLAS. Sporadic errors in the column labeled `other` are normally the sign of failed convergence of iterative diagonalizations for the same reasons just mentioned. A more detailed error report is stored in the file testing_results.txt - this should be consulted in case of doubt. Care should be taken if you encounter numerical errors in the hundreds, or `other` errors accompanied by the LAPACK error message "on entry to function_name parameter X had an illegal value" that signals a problem with argument passing between individual functions. 
+(See also [this issue](https://github.com/OpenMathLib/OpenBLAS/issues/4032) in the issue tracker on github for additional discussion, examples and links) + ### How could I disable OpenBLAS threading affinity on runtime? You can define the OPENBLAS_MAIN_FREE or GOTOBLAS_MAIN_FREE environment variable to disable threading affinity on runtime. For example, before the running, From c37509c213a34a8cae449ededd7bc7064675ecc4 Mon Sep 17 00:00:00 2001 From: "tingbo.liao" Date: Tue, 31 Dec 2024 08:46:55 +0800 Subject: [PATCH 199/244] Optimize the nrm2_rvv function to further improve performance. Signed-off-by: tingbo.liao --- kernel/riscv64/nrm2_rvv.c | 370 +++++++++++++++++++++----------------- 1 file changed, 204 insertions(+), 166 deletions(-) diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c index 14ed68b0a0..472b1148eb 100644 --- a/kernel/riscv64/nrm2_rvv.c +++ b/kernel/riscv64/nrm2_rvv.c @@ -27,185 +27,223 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(DOUBLE) -#define VSETVL __riscv_vsetvl_e64m4 -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT __riscv_vle64_v_f64m4 -#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 -#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 -#define VFMVSF_FLOAT __riscv_vfmv_s_f_f64m4 -#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 -#define MASK_T vbool16_t -#define VFABS __riscv_vfabs_v_f64m4 -#define VMFNE __riscv_vmfne_vf_f64m4_b16 -#define VMFGT __riscv_vmfgt_vv_f64m4_b16 -#define VMFEQ __riscv_vmfeq_vf_f64m4_b16 -#define VCPOP __riscv_vcpop_m_b16 -#define VFREDMAX __riscv_vfredmax_vs_f64m4_f64m1 -#define VFREDMIN __riscv_vfredmin_vs_f64m4_f64m1 -#define VFIRST __riscv_vfirst_m_b16 -#define VRGATHER __riscv_vrgather_vx_f64m4 -#define VFDIV __riscv_vfdiv_vv_f64m4 -#define VFDIV_M __riscv_vfdiv_vv_f64m4_mu -#define VFMUL __riscv_vfmul_vv_f64m4 -#define VFMUL_M __riscv_vfmul_vv_f64m4_mu -#define VFMACC __riscv_vfmacc_vv_f64m4 -#define VFMACC_M __riscv_vfmacc_vv_f64m4_mu -#define VMSBF __riscv_vmsbf_m_b16 -#define VMSOF __riscv_vmsof_m_b16 -#define VMAND __riscv_vmand_mm_b16 -#define VMANDN __riscv_vmand_mm_b16 -#define VFREDSUM __riscv_vfredusum_vs_f64m4_f64m1 -#define VMERGE __riscv_vmerge_vvm_f64m4 -#define VSEV_FLOAT __riscv_vse64_v_f64m4 -#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v) -#define ABS fabs -#else -#define VSETVL __riscv_vsetvl_e32m4 +#if !defined(DOUBLE) +#define VSETVL(n) __riscv_vsetvl_e32m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e32m4() #define FLOAT_V_T vfloat32m4_t #define FLOAT_V_T_M1 vfloat32m1_t +#define MASK_T vbool8_t #define VLEV_FLOAT __riscv_vle32_v_f32m4 #define VLSEV_FLOAT __riscv_vlse32_v_f32m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu #define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 -#define VFMVSF_FLOAT __riscv_vfmv_s_f_f32m4 #define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1 -#define MASK_T vbool8_t -#define VFABS __riscv_vfabs_v_f32m4 -#define VMFNE __riscv_vmfne_vf_f32m4_b8 -#define VMFGT __riscv_vmfgt_vv_f32m4_b8 -#define VMFEQ __riscv_vmfeq_vf_f32m4_b8 -#define VCPOP __riscv_vcpop_m_b8 -#define VFREDMAX __riscv_vfredmax_vs_f32m4_f32m1 -#define VFREDMIN __riscv_vfredmin_vs_f32m4_f32m1 -#define VFIRST __riscv_vfirst_m_b8 -#define VRGATHER __riscv_vrgather_vx_f32m4 -#define VFDIV __riscv_vfdiv_vv_f32m4 -#define VFDIV_M __riscv_vfdiv_vv_f32m4_mu -#define VFMUL __riscv_vfmul_vv_f32m4 -#define VFMUL_M __riscv_vfmul_vv_f32m4_mu -#define VFMACC 
__riscv_vfmacc_vv_f32m4 -#define VFMACC_M __riscv_vfmacc_vv_f32m4_mu -#define VMSBF __riscv_vmsbf_m_b8 -#define VMSOF __riscv_vmsof_m_b8 -#define VMAND __riscv_vmand_mm_b8 -#define VMANDN __riscv_vmand_mm_b8 -#define VFREDSUM __riscv_vfredusum_vs_f32m4_f32m1 -#define VMERGE __riscv_vmerge_vvm_f32m4 -#define VSEV_FLOAT __riscv_vse32_v_f32m4 -#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v) +#define VMFIRSTM __riscv_vfirst_m_b8 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f32m4_b8 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f32m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f32m4 #define ABS fabsf +#else +#define VSETVL(n) __riscv_vsetvl_e64m4(n) +#define VSETVL_MAX __riscv_vsetvlmax_e64m4() +#define FLOAT_V_T vfloat64m4_t +#define FLOAT_V_T_M1 vfloat64m1_t +#define MASK_T vbool16_t +#define VLEV_FLOAT __riscv_vle64_v_f64m4 +#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 +#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1_tu +#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 +#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1 +#define VMFIRSTM __riscv_vfirst_m_b16 +#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu +#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 +#define VMFGTVF_FLOAT __riscv_vmfgt_vf_f64m4_b16 +#define VFDIVVF_FLOAT __riscv_vfdiv_vf_f64m4 +#define VFABSV_FLOAT __riscv_vfabs_v_f64m4 +#define ABS fabs #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - - if (n <= 0 || inc_x == 0) return(0.0); - if(n == 1) return (ABS(x[0])); - - unsigned int gvl = 0; - - MASK_T nonzero_mask; - MASK_T scale_mask; - - gvl = VSETVL(n); - FLOAT_V_T v0; - FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl); - FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl); - - FLOAT scale = 0; - FLOAT ssq = 0; - unsigned int stride_x = inc_x * sizeof(FLOAT); - int idx = 0; - - if( n >= gvl && inc_x > 0 ) // don't pay overheads if we're not doing useful work - { - for(i=0; i 0 ){ + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT_V_T_M1 v_res, v_z0; + gvl = VSETVL_MAX; + v_res = VFMVVF_FLOAT_M1(0, gvl); + v_z0 = VFMVVF_FLOAT_M1(0, gvl); + MASK_T mask; + BLASLONG index = 0; + + if (inc_x == 1) { + gvl = VSETVL(n); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for (i = 0, j = 0; i < n / gvl; i++) { + v0 = VLEV_FLOAT(&x[j], gvl); + // fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + // if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) { // no elements greater than scale + if (scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } + else { // found greater element + // ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + // total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + // find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + // update ssq before max_index + ssq = ssq * (scale / VFMVFS_FLOAT(v_res)) * (scale / VFMVFS_FLOAT(v_res)); + // update scale + scale = VFMVFS_FLOAT(v_res); + // ssq in vector vr + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + j += gvl; + } + // ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + // total ssq now + ssq += VFMVFS_FLOAT(v_res); + + // tail processing + if(j < n){ + gvl = VSETVL(n-j); + v0 = VLEV_FLOAT(&x[j], gvl); + // fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + // if scale 
change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) { // no elements greater than scale + if(scale != 0.0) + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + } else { // found greater element + // find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + // update ssq before max_index + ssq = ssq * (scale / VFMVFS_FLOAT(v_res))*(scale / VFMVFS_FLOAT(v_res)); + // update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + } + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + // ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + // total ssq now + ssq += VFMVFS_FLOAT(v_res); + } + } + else { + gvl = VSETVL(n); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + int idx = 0, inc_v = inc_x * gvl; + for (i = 0, j = 0; i < n / gvl; i++) { + v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); + // fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + // if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if (index == -1) {// no elements greater than scale + if(scale != 0.0){ + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl); + } + } + else { // found greater element + // ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + // total ssq before current vector + ssq += VFMVFS_FLOAT(v_res); + // find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + // update ssq before max_index + ssq = ssq * (scale / VFMVFS_FLOAT(v_res))*(scale / VFMVFS_FLOAT(v_res)); + // update scale + scale = VFMVFS_FLOAT(v_res); + // ssq in vector vr + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + j += gvl; + idx += inc_v; + } + // ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + // total ssq now + ssq += VFMVFS_FLOAT(v_res); + + // tail processing + if (j < n) { + gvl = VSETVL(n-j); + v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl); + // fabs(vector) + v0 = VFABSV_FLOAT(v0, gvl); + // if scale change + mask = VMFGTVF_FLOAT(v0, scale, gvl); + index = VMFIRSTM(mask, gvl); + if(index == -1) { // no elements greater than scale + if(scale != 0.0) { + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + } + else { // found greater element + // find max + v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl); + // update ssq before max_index + ssq = ssq * (scale / VFMVFS_FLOAT(v_res))*(scale / VFMVFS_FLOAT(v_res)); + // update scale + scale = VFMVFS_FLOAT(v_res); + v0 = VFDIVVF_FLOAT(v0, scale, gvl); + vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); + } + // ssq in vector vr: vr[0] + v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl); + // total ssq now + ssq += VFMVFS_FLOAT(v_res); + } + } + } + else{ + // using scalar ops when inc_x < 0 + n *= inc_x; while(abs(i) < abs(n)){ - if ( x[i] != 0.0 ){ - FLOAT absxi = ABS( x[i] ); - if ( scale < absxi ){ - ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); - scale = absxi ; - } - else{ - ssq += ( absxi/scale ) * ( absxi/scale ); - } - - } - - i += inc_x; + if ( x[i] != 0.0 ){ + FLOAT absxi = ABS( x[i] ); + if ( scale < absxi ){ + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else{ + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; } - + } return(scale * sqrt(ssq)); } From 6ad793d65ec1e5e733e3c2e2327793cc1d3b8360 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 31 Dec 2024 14:34:55 +0100 Subject: [PATCH 200/244] Fix naming of 
suffixed libraries in the cmake and pkgconfig files --- Makefile.install | 4 ++-- openblas.pc.in | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.install b/Makefile.install index bfed157a49..486e9233eb 100644 --- a/Makefile.install +++ b/Makefile.install @@ -191,13 +191,13 @@ endif #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" - @echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "file(REAL_PATH \"../../..\" _OpenBLAS_ROOT_DIR BASE_DIRECTORY \$${CMAKE_CURRENT_LIST_DIR} )" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS \$${_OpenBLAS_ROOT_DIR}/include)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) - @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" + @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/lib/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" endif ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT)) @echo "SET(OpenBLAS_LIBRARIES \$${_OpenBLAS_ROOT_DIR}/bin/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" diff --git a/openblas.pc.in b/openblas.pc.in index 7632645ac1..fe2f087208 100644 --- a/openblas.pc.in +++ b/openblas.pc.in @@ -2,6 +2,6 @@ Name: openblas Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: ${version} URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix}${libsuffix} +Libs: -L${libdir} -l${libprefix}openblas${libsuffix}${libnamesuffix} Libs.private: ${extralib} Cflags: -I${includedir} ${omp_opt} From e9ff70b3941d99ad101286629e0044f6de83daa5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 31 Dec 2024 15:55:13 +0100 Subject: [PATCH 201/244] Add an install_tests target to facilitate testing on cross-compiled targets --- Makefile | 3 ++ Makefile.install | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) diff --git a/Makefile b/Makefile index 78f82dea59..4c72177343 100644 --- a/Makefile +++ b/Makefile @@ -426,6 +426,9 @@ dummy : install : $(MAKE) -f Makefile.install install +install_tests : + $(MAKE) -f Makefile.install install_tests + clean :: @for d in $(SUBDIRS_ALL) ; \ do if test -d $$d; then \ diff --git a/Makefile.install b/Makefile.install index 486e9233eb..cd1dcdabcb 100644 --- a/Makefile.install +++ b/Makefile.install @@ -227,3 +227,96 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! 
+ +install_tests : lib.grd +ifneq ($(ONLY_CBLAS), 1) + @install -m 666 utest/openblas_utest $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 utest/openblas_utest_ext $(DESTDIR)$(OPENBLAS_BINARY_DIR) +ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) +ifndef NO_FBLAS +ifeq ($(BUILD_BFLOAT16),1) + @install -m 666 test/test_sbgemm $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +ifeq ($(BUILD_SINGLE),1) + @install -m 666 test/sblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/sblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/sblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/sblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/sblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +ifeq ($(BUILD_DOUBLE),1) + @install -m 666 test/dblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/dblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/dblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/dblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/dblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +ifeq ($(BUILD_COMPLEX),1) + @install -m 666 test/cblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/cblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/cblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/cblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/cblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) +ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS)) + @install -m 666 test/cblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/cblat3_3m.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +endif +ifeq ($(BUILD_COMPLEX16),1) + @install -m 666 test/zblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/zblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/zblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/zblat2.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/zblat3.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) +ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS)) + @install -m 666 test/zblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 test/zblat3_3m.dat $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +endif +endif +endif +ifneq ($(ONLY_CBLAS), 1) +ifeq ($(BUILD_SINGLE),1) + @install -m 666 ctest/xscblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/xscblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/xscblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/sin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/sin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +ifeq ($(BUILD_DOUBLE),1) + @install -m 666 ctest/xdcblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/xdcblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/xdcblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/din2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/din3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +ifeq ($(BUILD_COMPLEX),1) + @install -m 666 ctest/xccblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/xccblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/xccblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/cin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/cin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) +ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS)) + @install -m 666 ctest/xccblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/cin3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +endif +ifeq ($(BUILD_COMPLEX16),1) + @install -m 666 
ctest/xzcblat1 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/xzcblat2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/xzcblat3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/zin2 $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/zin3 $(DESTDIR)$(OPENBLAS_BINARY_DIR) +ifeq ($(ARCH), filter($(ARCH), x86 x86_64 ia64 MIPS)) + @install -m 666 ctest/xzcblat3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) + @install -m 666 ctest/zin3_3m $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +endif + +endif +ifeq ($(CPP_THREAD_SAFETY_TEST), 1) +@install -m 666 cpp_thread_test/dgemm_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) +@install -m 666 cpp_thread_test/dgemv_tester $(DESTDIR)$(OPENBLAS_BINARY_DIR) +endif +endif + From be807c98a6463f18bed2c5ea111a02b670b20f57 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Jan 2025 21:42:10 +0100 Subject: [PATCH 202/244] Identify all cores, group by performance and report the fastest TARGET --- cpuid_arm64.c | 184 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 123 insertions(+), 61 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index fbb78e7943..3e0022b845 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -25,6 +25,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#include #include #ifdef __APPLE__ #include @@ -33,6 +34,20 @@ size_t length=sizeof(value); int64_t value64; size_t length64=sizeof(value64); #endif +#if (defined OS_LINUX || defined OS_ANDROID) +#include +#include +#ifndef HWCAP_CPUID +#define HWCAP_CPUID (1 << 11) +#endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif + +#define get_cpu_ftr(id, var) ({ \ + __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ + }) +#endif #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 @@ -42,11 +57,9 @@ size_t length64=sizeof(value64); #define CPU_CORTEXA57 3 #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 -#define CPU_CORTEXA76 23 #define CPU_NEOVERSEN1 11 #define CPU_NEOVERSEV1 16 #define CPU_NEOVERSEN2 17 -#define CPU_NEOVERSEV2 24 #define CPU_CORTEXX1 18 #define CPU_CORTEXX2 19 #define CPU_CORTEXA510 20 @@ -91,9 +104,7 @@ static char *cpuname[] = { "CORTEXX2", "CORTEXA510", "CORTEXA710", - "FT2000", - "CORTEXA76", - "NEOVERSEV2" + "FT2000" }; static char *cpuname_lower[] = { @@ -119,15 +130,17 @@ static char *cpuname_lower[] = { "cortexx2", "cortexa510", "cortexa710", - "ft2000", - "cortexa76", - "neoversev2" + "ft2000" }; +static int cpulowperf=0; +static int cpumidperf=0; +static int cpuhiperf=0; + int get_feature(char *search) { -#if defined( __linux ) || defined( __NetBSD__ ) +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -158,33 +171,108 @@ int get_feature(char *search) #endif return(0); } - +static int cpusort(const void *model1, const void *model2) +{ + return (*(int*)model2-*(int*)model1); +} int detect(void) { -#if defined( __linux ) || defined( __NetBSD__ ) - +#ifdef __linux + int n,i,ii; + int midr_el1; + int implementer; + int cpucap[1024]; + int cpucores[1024]; FILE *infile; - char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; + char cpupart[6],cpuimpl[6]; + char *cpu_impl=NULL,*cpu_pt=NULL; + char buffer[2048], *p, *cpu_part = NULL, *cpu_implementer = NULL; p = (char *) NULL ; - - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)) { - if ((cpu_part != NULL) && (cpu_implementer != NULL)) { - break; + cpulowperf=cpumidperf=cpuhiperf=0; + for 
(i=0;i<1024;i++)cpucores[i]=0; + n=0; + infile = fopen("/sys/devices/system/cpu/possible", "r"); + if (!infile) { + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)) { + if (!strncmp("processor", buffer, 9)) + n++; } - - if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) { - cpu_part = strchr(buffer, ':') + 2; - cpu_part = strdup(cpu_part); - } else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) { - cpu_implementer = strchr(buffer, ':') + 2; - cpu_implementer = strdup(cpu_implementer); + } else { + fgets(buffer, sizeof(buffer), infile); + sscanf(buffer,"0-%d",&n); + n++; + } + fclose(infile); + + cpu_implementer=NULL; + for (i=0;i= 0xd4b) cpuhiperf++; + else + if (cpucores[ii] >= 0xd07) cpumidperf++; + else cpulowperf++; + } + else cpulowperf++; + } + fclose(infile); + break; + } else { + (void)fgets(buffer, sizeof(buffer), infile); + midr_el1=strtoul(buffer,NULL,16); + fclose(infile); + implementer = (midr_el1 >> 24) & 0xFF; + cpucores[i] = (midr_el1 >> 4) & 0xFFF; + sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capacity",i); + infile= fopen(buffer,"r"); + if (!infile) { + if (implementer== 65) { + if (cpucores[i] >= 0xd4b) cpuhiperf++; + else + if (cpucores[i] >= 0xd07) cpumidperf++; + else cpulowperf++; + } + else cpulowperf++; + } else { + (void)fgets(buffer, sizeof(buffer), infile); + sscanf(buffer,"%d",&cpucap[i]); + if (cpucap[i] >= 1000) cpuhiperf++; + else + if (cpucap[i] >= 500) cpumidperf++; + else cpulowperf++; + fclose(infile); + } } + sprintf(cpuimpl,"0x%2x",implementer); + cpu_implementer=strdup(cpuimpl); } - - fclose(infile); + qsort(cpucores,1024,sizeof(int),cpusort); + sprintf(cpupart,"0x%3x",cpucores[0]); + cpu_part=strdup(cpupart); if(cpu_part != NULL && cpu_implementer != NULL) { // Arm if (strstr(cpu_implementer, "0x41")) { @@ -216,10 +304,6 @@ int detect(void) return CPU_CORTEXX2; else if (strstr(cpu_part, "0xd4e")) //X3 return CPU_CORTEXX2; - else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al. 
- return CPU_NEOVERSEV2; - else if (strstr(cpu_part, "0xd0b")) - return CPU_CORTEXA76; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) @@ -280,8 +364,6 @@ int detect(void) sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 - if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 - if (value64 == 1867590060) return CPU_VORTEX; //M4 #endif return CPU_ARMV8; #endif @@ -314,7 +396,7 @@ void get_cpucount(void) { int n=0; -#if defined( __linux ) || defined( __NetBSD__ ) +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -331,6 +413,12 @@ int n=0; fclose(infile); printf("#define NUM_CORES %d\n",n); + if (cpulowperf >0) + printf("#define NUM_CORES_LP %d\n",cpulowperf); + if (cpumidperf >0) + printf("#define NUM_CORES_MP %d\n",cpumidperf); + if (cpuhiperf >0) + printf("#define NUM_CORES_HP %d\n",cpuhiperf); #endif #ifdef __APPLE__ sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); @@ -347,7 +435,6 @@ void get_cpuconfig(void) printf("#define ARMV8\n"); printf("#define HAVE_NEON\n"); // This shouldn't be necessary printf("#define HAVE_VFPV4\n"); // This shouldn't be necessary - int d = detect(); switch (d) { @@ -402,8 +489,6 @@ void get_cpuconfig(void) break; case CPU_NEOVERSEV1: - printf("#define HAVE_SVE 1\n"); - case CPU_CORTEXA76: printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_LINESIZE 64\n"); @@ -431,32 +516,12 @@ void get_cpuconfig(void) printf("#define L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); - printf("#define HAVE_SVE 1\n"); - break; - case CPU_NEOVERSEV2: - printf("#define ARMV9\n"); - printf("#define HAVE_SVE 1\n"); - printf("#define %s\n", cpuname[d]); - printf("#define L1_CODE_SIZE 65536\n"); - printf("#define L1_CODE_LINESIZE 64\n"); - printf("#define L1_CODE_ASSOCIATIVE 4\n"); - printf("#define L1_DATA_SIZE 65536\n"); - printf("#define L1_DATA_LINESIZE 64\n"); - printf("#define L1_DATA_ASSOCIATIVE 4\n"); - printf("#define L2_SIZE 1048576\n"); - printf("#define L2_LINESIZE 64\n"); - printf("#define L2_ASSOCIATIVE 8\n"); - // L1 Data TLB = 48 entries - // L2 Data TLB = 2048 entries - printf("#define DTB_DEFAULT_ENTRIES 48\n"); - printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs. 
break; case CPU_CORTEXA510: case CPU_CORTEXA710: case CPU_CORTEXX1: case CPU_CORTEXX2: printf("#define ARMV9\n"); - printf("#define HAVE_SVE 1\n"); printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_LINESIZE 64\n"); @@ -559,8 +624,6 @@ void get_cpuconfig(void) case CPU_VORTEX: printf("#define VORTEX \n"); #ifdef __APPLE__ - sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); - if (value64 == 1867590060) printf("#define HAVE_SME 1\n");; //M4 sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); printf("#define L1_CODE_SIZE %lld \n",value64); sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); @@ -575,7 +638,6 @@ void get_cpuconfig(void) break; case CPU_A64FX: printf("#define A64FX\n"); - printf("#define HAVE_SVE 1\n"); printf("#define L1_CODE_SIZE 65535\n"); printf("#define L1_DATA_SIZE 65535\n"); printf("#define L1_DATA_LINESIZE 256\n"); @@ -608,7 +670,7 @@ void get_libname(void) void get_features(void) { -#if defined( __linux ) || defined( __NetBSD__ ) +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; From 3c3d1c48495091de690bc9a480702528465e1bfa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Jan 2025 22:21:29 +0100 Subject: [PATCH 203/244] Identify all cores and select the most performant one as TARGET --- driver/others/dynamic_arm64.c | 49 ++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index dc88d816fb..53ec99e476 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -271,15 +271,52 @@ static gotoblas_t *get_coretype(void) { if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { #ifdef __linux + int i; + int ncores=0; + int p,cpucap,cpulowperf=0,cpumidperf=0,cpuhiperf=0; FILE *infile; char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; p = (char *) NULL ; - infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r"); - if (!infile) return NULL; - (void)fgets(buffer, sizeof(buffer), infile); - midr_el1=strtoul(buffer,NULL,16); - fclose(infile); -#else + infile = fopen("/sys/devices/system/cpu/possible","r"); + if (infile) { + (void)fgets(buffer, sizeof(buffer), infile); + sscanf(buffer,"0-%d",&ncores); + fclose (infile); + ncores++; + } else { + infile = fopen("/proc/cpuinfo","r"); + while (fgets(buffer, sizeof(buffer), infile)) { + if (!strncmp("processor", buffer, 9)) + ncores++; + } + } + for (i=0;i> 24) & 0xFF; + p = (midr_el1 >> 4) & 0xFFF; + fclose(infile); + sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capability",i); + infile = fopen(buffer,"r"); + if (infile) { + (void)fgets(buffer, sizeof(buffer), infile); + cpucap=strtoul(buffer,NULL,16); + fclose(infile); + if (cpucap >= 1000) cpuhiperf++; + else if (cpucap >=500) cpumidperf++; + else cpulowperf++; + if (cpucap >=1000) part = p; + } else if (implementer == 0x41 ){ + if (p >= 0xd4b) cpuhiperf++: + else if (p>= 0xd07) cpumidperf++; + else cpulowperf++; + } else cpulowperf++; + } + if (!part) part = p; +#else snprintf(coremsg, 128, "Kernel lacks cpuid feature support. 
Auto detection of core type failed !!!\n"); openblas_warning(1, coremsg); return NULL; From ed957916182627d7cd20efc5a8dfb400b1c26457 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 1 Jan 2025 23:27:38 +0100 Subject: [PATCH 204/244] fix conflicting variables --- driver/others/dynamic_arm64.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 53ec99e476..b1aad68d9b 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -273,10 +273,10 @@ static gotoblas_t *get_coretype(void) { #ifdef __linux int i; int ncores=0; - int p,cpucap,cpulowperf=0,cpumidperf=0,cpuhiperf=0; + int prt,cpucap,cpulowperf=0,cpumidperf=0,cpuhiperf=0; FILE *infile; - char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; - p = (char *) NULL ; + char buffer[512], *cpu_part = NULL, *cpu_implementer = NULL; + infile = fopen("/sys/devices/system/cpu/possible","r"); if (infile) { (void)fgets(buffer, sizeof(buffer), infile); @@ -297,7 +297,7 @@ static gotoblas_t *get_coretype(void) { (void)fgets(buffer, sizeof(buffer), infile); midr_el1=strtoul(buffer,NULL,16); implementer = (midr_el1 >> 24) & 0xFF; - p = (midr_el1 >> 4) & 0xFFF; + prt = (midr_el1 >> 4) & 0xFFF; fclose(infile); sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capability",i); infile = fopen(buffer,"r"); @@ -308,14 +308,14 @@ static gotoblas_t *get_coretype(void) { if (cpucap >= 1000) cpuhiperf++; else if (cpucap >=500) cpumidperf++; else cpulowperf++; - if (cpucap >=1000) part = p; + if (cpucap >=1000) part = prt; } else if (implementer == 0x41 ){ - if (p >= 0xd4b) cpuhiperf++: - else if (p>= 0xd07) cpumidperf++; + if (prt >= 0xd4b) cpuhiperf++: + else if (prt>= 0xd07) cpumidperf++; else cpulowperf++; } else cpulowperf++; } - if (!part) part = p; + if (!part) part = prt; #else snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n"); openblas_warning(1, coremsg); @@ -323,7 +323,7 @@ static gotoblas_t *get_coretype(void) { #endif } else { get_cpu_ftr(MIDR_EL1, midr_el1); - } + /* * MIDR_EL1 * @@ -334,7 +334,7 @@ static gotoblas_t *get_coretype(void) { */ implementer = (midr_el1 >> 24) & 0xFF; part = (midr_el1 >> 4) & 0xFFF; - + } switch(implementer) { case 0x41: // ARM From a182251284835e5fb56c2074b8bb08c04ebbc9b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 2 Jan 2025 00:04:33 +0100 Subject: [PATCH 205/244] fix typo --- driver/others/dynamic_arm64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index b1aad68d9b..37991184ac 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -310,7 +310,7 @@ static gotoblas_t *get_coretype(void) { else cpulowperf++; if (cpucap >=1000) part = prt; } else if (implementer == 0x41 ){ - if (prt >= 0xd4b) cpuhiperf++: + if (prt >= 0xd4b) cpuhiperf++; else if (prt>= 0xd07) cpumidperf++; else cpulowperf++; } else cpulowperf++; From 14c72d616a4a372827992a1b1f9ccda56b43b9d4 Mon Sep 17 00:00:00 2001 From: "tingbo.liao" Date: Thu, 2 Jan 2025 10:05:57 +0800 Subject: [PATCH 206/244] Add the test cases of rot to improve unit testing. 
Signed-off-by: tingbo.liao --- utest/test_rot.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/utest/test_rot.c b/utest/test_rot.c index 0e74ecbb36..03776586b0 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -53,6 +53,23 @@ CTEST(rot,drot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +CTEST(rot,drot_inc_1) +{ + blasint i=0; + blasint N=4,incX=1,incY=1; + double c=1.0,s=1.0; + double x1[]={1.0,3.0,5.0,7.0}; + double y1[]={2.0,4.0,6.0,8.0}; + double x2[]={3.0,7.0,11.0,15.0}; + double y2[]={1.0,1.0,1.0,1.0}; + + BLASFUNC(drot)(&N,x1,&incX,y1,&incY,&c,&s); + + for(i=0; i Date: Thu, 2 Jan 2025 06:13:07 -0800 Subject: [PATCH 207/244] Fix accidentally dropped cpu ids and add MacOS performance groups --- cpuid_arm64.c | 66 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 3e0022b845..47e8ffcd61 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -57,9 +57,11 @@ size_t length64=sizeof(value64); #define CPU_CORTEXA57 3 #define CPU_CORTEXA72 4 #define CPU_CORTEXA73 5 +#define CPU_CORTEXA76 23 #define CPU_NEOVERSEN1 11 #define CPU_NEOVERSEV1 16 #define CPU_NEOVERSEN2 17 +#define CPU_NEOVERSEV2 24 #define CPU_CORTEXX1 18 #define CPU_CORTEXX2 19 #define CPU_CORTEXA510 20 @@ -104,7 +106,9 @@ static char *cpuname[] = { "CORTEXX2", "CORTEXA510", "CORTEXA710", - "FT2000" + "FT2000", + "CORTEXA76", + "NEOVERSEV2" }; static char *cpuname_lower[] = { @@ -130,7 +134,9 @@ static char *cpuname_lower[] = { "cortexx2", "cortexa510", "cortexa710", - "ft2000" + "ft2000", + "cortexa76", + "neoversev2" }; static int cpulowperf=0; @@ -140,7 +146,7 @@ static int cpuhiperf=0; int get_feature(char *search) { -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -179,7 +185,7 @@ static int cpusort(const void *model1, const void *model2) int detect(void) { -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) int n,i,ii; int midr_el1; int implementer; @@ -243,8 +249,8 @@ int detect(void) break; } else { (void)fgets(buffer, sizeof(buffer), infile); - midr_el1=strtoul(buffer,NULL,16); - fclose(infile); + midr_el1=strtoul(buffer,NULL,16); + fclose(infile); implementer = (midr_el1 >> 24) & 0xFF; cpucores[i] = (midr_el1 >> 4) & 0xFFF; sprintf(buffer,"/sys/devices/system/cpu/cpu%d/cpu_capacity",i); @@ -304,6 +310,10 @@ int detect(void) return CPU_CORTEXX2; else if (strstr(cpu_part, "0xd4e")) //X3 return CPU_CORTEXX2; + else if (strstr(cpu_part, "0xd4f")) //NVIDIA Grace et al. 
+ return CPU_NEOVERSEV2; + else if (strstr(cpu_part, "0xd0b")) + return CPU_CORTEXA76; } // Qualcomm else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) @@ -361,9 +371,20 @@ int detect(void) } #else #ifdef __APPLE__ + sysctlbyname("hw.ncpu",&value64,&length64,NULL,0); + cpulowperf=value64; + sysctlbyname("hw.nperflevels",&value64,&length64,NULL,0); + if (value64 > 1) { + sysctlbyname("hw.perflevel0.cpusperl",&value64,&length64,NULL,0); + cpuhiperf=value64; + sysctlbyname("hw.perflevel1.cpusperl",&value64,&length64,NULL,0); + cpulowperf=value64; + } sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0); if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1 if (value64 == 3660830781) return CPU_VORTEX; //A15/M2 + if (value64 == 2271604202) return CPU_VORTEX; //A16/M3 + if (value64 == 1867590060) return CPU_VORTEX; //M4 #endif return CPU_ARMV8; #endif @@ -396,7 +417,7 @@ void get_cpucount(void) { int n=0; -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -423,6 +444,12 @@ int n=0; #ifdef __APPLE__ sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); printf("#define NUM_CORES %d\n",value); + if (cpulowperf >0) + printf("#define NUM_CORES_LP %d\n",cpulowperf); + if (cpumidperf >0) + printf("#define NUM_CORES_MP %d\n",cpumidperf); + if (cpuhiperf >0) + printf("#define NUM_CORES_HP %d\n",cpuhiperf); #endif } @@ -489,6 +516,8 @@ void get_cpuconfig(void) break; case CPU_NEOVERSEV1: + printf("#define HAVE_SVE 1\n"); + case CPU_CORTEXA76: printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_LINESIZE 64\n"); @@ -516,12 +545,32 @@ void get_cpuconfig(void) printf("#define L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 48\n"); printf("#define DTB_SIZE 4096\n"); + printf("#define HAVE_SVE 1\n"); break; + case CPU_NEOVERSEV2: + printf("#define ARMV9\n"); + printf("#define HAVE_SVE 1\n"); + printf("#define %s\n", cpuname[d]); + printf("#define L1_CODE_SIZE 65536\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 4\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 4\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 8\n"); + // L1 Data TLB = 48 entries + // L2 Data TLB = 2048 entries + printf("#define DTB_DEFAULT_ENTRIES 48\n"); + printf("#define DTB_SIZE 4096\n"); // Set to 4096 for symmetry with other configs. 
+ break; case CPU_CORTEXA510: case CPU_CORTEXA710: case CPU_CORTEXX1: case CPU_CORTEXX2: printf("#define ARMV9\n"); + printf("#define HAVE_SVE 1\n"); printf("#define %s\n", cpuname[d]); printf("#define L1_CODE_SIZE 65536\n"); printf("#define L1_CODE_LINESIZE 64\n"); @@ -638,6 +687,7 @@ void get_cpuconfig(void) break; case CPU_A64FX: printf("#define A64FX\n"); + printf("#define HAVE_SVE 1\n"); printf("#define L1_CODE_SIZE 65535\n"); printf("#define L1_DATA_SIZE 65535\n"); printf("#define L1_DATA_LINESIZE 256\n"); @@ -670,7 +720,7 @@ void get_libname(void) void get_features(void) { -#ifdef __linux +#if defined( __linux ) || defined( __NetBSD__ ) FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; From 108bf599ae485292622613fa4261eb8f18f0f746 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Jan 2025 17:19:41 +0100 Subject: [PATCH 208/244] Create harmonyos.yml --- .github/workflows/harmonyos.yml | 37 +++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/harmonyos.yml diff --git a/.github/workflows/harmonyos.yml b/.github/workflows/harmonyos.yml new file mode 100644 index 0000000000..75efaa9258 --- /dev/null +++ b/.github/workflows/harmonyos.yml @@ -0,0 +1,37 @@ +name: harmonyos + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read # to fetch code (actions/checkout) + +jobs: + build: + if: "github.repository == 'OpenMathLib/OpenBLAS'" + runs-on: ubuntu-latest + env: + OHOS_NDK_CMAKE: $GITHUB_WORKSPACE/ohos-sdk/linux/native/build-tools/cmake/bin/cmake + COMMON_CMAKE_OPTIONS: | + -DCMAKE_TOOLCHAIN_FILE=$GITHUB_WORKSPACE/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \ + -DCMAKE_INSTALL_PREFIX=install \ + -DCMAKE_BUILD_TYPE=Release \ + steps: + - uses: actions/checkout@v4 + - name: ndk-install + run: | + wget https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz + tar -xf ohos-sdk-windows_linux-public.tar.gz + cd ohos-sdk/linux + unzip -q native-linux-x64-4.1.7.8-Release.zip + -name: build-armv8 + run: | + cd + mkdir build && cd build + ${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \ + -DTARGET=ARMV8 .. + ${{ env.OHOS_NDK_CMAKE }} --build . -j $(nproc) + ctest From 67bbde71e534aee4cb25fbac4cf1698797c15575 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Jan 2025 17:47:17 +0100 Subject: [PATCH 209/244] Update harmonyos.yml --- .github/workflows/harmonyos.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/harmonyos.yml b/.github/workflows/harmonyos.yml index 75efaa9258..19ec17b892 100644 --- a/.github/workflows/harmonyos.yml +++ b/.github/workflows/harmonyos.yml @@ -27,11 +27,11 @@ jobs: tar -xf ohos-sdk-windows_linux-public.tar.gz cd ohos-sdk/linux unzip -q native-linux-x64-4.1.7.8-Release.zip - -name: build-armv8 - run: | + - name: build-armv8 + run: | cd mkdir build && cd build ${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \ -DTARGET=ARMV8 .. ${{ env.OHOS_NDK_CMAKE }} --build . 
-j $(nproc) - ctest + From 593427c8a1046989051f94ff53185ab33d20c4eb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Jan 2025 18:10:05 +0100 Subject: [PATCH 210/244] Update harmonyos.yml --- .github/workflows/harmonyos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/harmonyos.yml b/.github/workflows/harmonyos.yml index 19ec17b892..deab48e1a3 100644 --- a/.github/workflows/harmonyos.yml +++ b/.github/workflows/harmonyos.yml @@ -27,9 +27,9 @@ jobs: tar -xf ohos-sdk-windows_linux-public.tar.gz cd ohos-sdk/linux unzip -q native-linux-x64-4.1.7.8-Release.zip + cd - - name: build-armv8 run: | - cd mkdir build && cd build ${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \ -DTARGET=ARMV8 .. From 14e68355d2047c7fc24013c1078f8eff19853721 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Jan 2025 18:29:39 +0100 Subject: [PATCH 211/244] Update harmonyos.yml --- .github/workflows/harmonyos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/harmonyos.yml b/.github/workflows/harmonyos.yml index deab48e1a3..118fe6300e 100644 --- a/.github/workflows/harmonyos.yml +++ b/.github/workflows/harmonyos.yml @@ -32,6 +32,6 @@ jobs: run: | mkdir build && cd build ${{ env.OHOS_NDK_CMAKE }} ${{ env.COMMON_CMAKE_OPTIONS }} -DOHOS_ARCH="arm64-v8a" \ - -DTARGET=ARMV8 .. + -DTARGET=ARMV8 -DNOFORTRAN=1 .. ${{ env.OHOS_NDK_CMAKE }} --build . -j $(nproc) From 9b9c0aa5c9ab08df01fb5acfe83a50b4c7ce6bdd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Jan 2025 21:36:46 +0100 Subject: [PATCH 212/244] temporarily disable the default S/DSCAL kernel --- kernel/power/KERNEL.PPC970 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index fee5fa5290..d8e8c66466 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -89,3 +89,6 @@ DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c endif + +SSCALKERNEL = ../arm/scal.c +DSCALKERNEL = ../arm/scal.c From afd572511f410e1d705e793b6995ae8b11cf41dc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Jan 2025 22:47:38 +0100 Subject: [PATCH 213/244] Add build instructions for Huawei's HarmonyOS from #5043 --- docs/install.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/install.md b/docs/install.md index 3bc7ffc8f9..b7d8a36167 100644 --- a/docs/install.md +++ b/docs/install.md @@ -690,6 +690,29 @@ make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 Adjust `MIN_IOS_VERSION` as necessary for your installation. E.g., change the version number to the minimum iOS version you want to target and execute this file to build the library. +### HarmonyOS + +For this target you will need the cross-compiler toolchain package by Huawei, which contains solutions for both Windows and Linux. Only the Linux-based +toolchain has been tested so far, but the following instructions may apply similarly to Windows: + +Download https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz (or whatever newer version may be available in the future). Use tar xvf ohos-sdk-windows_linux_public.tar.gz to unpack it somewhere on your system. This will create a folder named "ohos-sdk" with subfolders "linux" and "windows". 
In the linux one you will find a ZIP archive named "native-linux-x64-4.1.7.8-Release.zip" - you need to unzip this where you want to +install the cross-compiler, for example in /opt/ohos-sdk. + +In the directory where you unpacked OpenBLAS, create a build directory for cmake, and change into it : +``` +mkdir build +cd build +``` +Use the version of `cmake` that came with the SDK, and specify the location of its toolchain file as a cmake option. Also set the build target for OpenBLAS to ARMV8 and specify NOFORTRAN=1 (at least as of version 4.1.1, the SDK contains no Fortran compiler): +``` +/opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake -DCMAKE_TOOLCHAIN_FILE=/opt/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \ + -DOHOS_ARCH="arm64-v8a" -DTARGET=ARMV8 -DNOFORTRAN=1 .. +``` +Additional other OpenBLAS build options like USE_OPENMP=1 or DYNAMIC_ARCH=1 will probably work too. +Finally do the build: +``` +/opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake --build . +``` ### MIPS From 33b9e5b14313cbcd15856253a9165508a98e2859 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 3 Jan 2025 23:32:34 +0100 Subject: [PATCH 214/244] Add a documentation page for the runtime variables --- docs/runtime_variables.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 docs/runtime_variables.md diff --git a/docs/runtime_variables.md b/docs/runtime_variables.md new file mode 100644 index 0000000000..3e83110bb6 --- /dev/null +++ b/docs/runtime_variables.md @@ -0,0 +1,23 @@ +## Runtime variables + +OpenBLAS checks the following environment variables on startup: + +* **OPENBLAS_NUM_THREADS=** the number of threads to use (for non OpenMP-builds of OpenBLAS) +* **OMP_NUM_THREADS=** the number of threads to use (for OpenMP builds - note that setting this may also affect any other OpenMP code) +* **OPENBLAS_DEFAULT_NUM_THREADS=** the number of threads to use, irrespective if OpenBLAS was built for OpenMP or pthreads + +* **OPENBLAS_MAIN_FREE=1**" this can be used to disable automatic assignment of cpu affinity in OpenBLAS builds that have it enabled by default +* **OPENBLAS_THREAD_TIMEOUT=** this can be used to define the length of time that idle threads should wait before exiting +* **OMP_ADAPTIVE=1** this can be used in OpenMP builds to actually remove any surplus threads when the number of threads is decreased +* +DYNAMIC_ARCH builds also accept the following: +* **OPENBLAS_VERBOSE=** set this to "1" to enable a warning when there is no exact match for the detected cpu in the library +* set this to "2" to make OpenBLAS print the name of the cpu target it autodetected +* **OPENBLAS_CORETYPE=** set this to one of the supported target names to override autodetection, e.g. 
OPENBLAS_CORETYPE=HASWELL + + + +Deprecated variables still recognized for compatibilty: +* **GOTO_NUM_THREADS=** equivalent to **OPENBLAS_NUM_THREADS** +* **GOTOBLAS_MAIN_FREE** equivalent to **OPENBLAS_MAIN_FREE** +* **OPENBLAS_BLOCK_FACTOR** this applies a scale factor to the GEMM "P" parameter of the block matrix code, see file driver/others/parameter.cen From b6c906f80545aa98d1324a9b0d0e77e46f9fc19b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 4 Jan 2025 13:02:35 +0100 Subject: [PATCH 215/244] Add OPENBLAS_L2_SIZE and improve formatting --- docs/runtime_variables.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/runtime_variables.md b/docs/runtime_variables.md index 3e83110bb6..a43b98cac5 100644 --- a/docs/runtime_variables.md +++ b/docs/runtime_variables.md @@ -2,18 +2,20 @@ OpenBLAS checks the following environment variables on startup: -* **OPENBLAS_NUM_THREADS=** the number of threads to use (for non OpenMP-builds of OpenBLAS) +* **OPENBLAS_NUM_THREADS=** the number of threads to use (for non-OpenMP-builds of OpenBLAS) * **OMP_NUM_THREADS=** the number of threads to use (for OpenMP builds - note that setting this may also affect any other OpenMP code) * **OPENBLAS_DEFAULT_NUM_THREADS=** the number of threads to use, irrespective if OpenBLAS was built for OpenMP or pthreads * **OPENBLAS_MAIN_FREE=1**" this can be used to disable automatic assignment of cpu affinity in OpenBLAS builds that have it enabled by default * **OPENBLAS_THREAD_TIMEOUT=** this can be used to define the length of time that idle threads should wait before exiting * **OMP_ADAPTIVE=1** this can be used in OpenMP builds to actually remove any surplus threads when the number of threads is decreased -* + + DYNAMIC_ARCH builds also accept the following: * **OPENBLAS_VERBOSE=** set this to "1" to enable a warning when there is no exact match for the detected cpu in the library -* set this to "2" to make OpenBLAS print the name of the cpu target it autodetected + set this to "2" to make OpenBLAS print the name of the cpu target it autodetected * **OPENBLAS_CORETYPE=** set this to one of the supported target names to override autodetection, e.g. OPENBLAS_CORETYPE=HASWELL +* **OPENBLAS_L2_SIZE=** set this to override the autodetected size of the L2 cache where it is not reported correctly (in virtual environments) From 81e1be8d90ea6537d790549d11504d1992961e28 Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Sat, 4 Jan 2025 22:54:54 +0800 Subject: [PATCH 216/244] Revert "temporarily disable the default S/DSCAL kernel" This reverts commit 9b9c0aa5c9ab08df01fb5acfe83a50b4c7ce6bdd. 
--- kernel/power/KERNEL.PPC970 | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970 index d8e8c66466..fee5fa5290 100644 --- a/kernel/power/KERNEL.PPC970 +++ b/kernel/power/KERNEL.PPC970 @@ -89,6 +89,3 @@ DROTKERNEL = ../arm/rot.c CROTKERNEL = ../arm/zrot.c ZROTKERNEL = ../arm/zrot.c endif - -SSCALKERNEL = ../arm/scal.c -DSCALKERNEL = ../arm/scal.c From 229efa42ffc2d0623062f1646f72ad9beb71f2cc Mon Sep 17 00:00:00 2001 From: Sergey Fedorov Date: Sun, 5 Jan 2025 00:31:27 +0800 Subject: [PATCH 217/244] scal.S: use r11 on 32-bit Darwin on powerpc --- kernel/power/scal.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/scal.S b/kernel/power/scal.S index 5e92a88aa1..eceb9fe8ed 100644 --- a/kernel/power/scal.S +++ b/kernel/power/scal.S @@ -59,7 +59,7 @@ #if !defined(__64BIT__) && defined(DOUBLE) #define X r8 #define INCX r9 -#define FLAG r13 +#define FLAG r11 #else #define X r7 #define INCX r8 @@ -91,7 +91,7 @@ fcmpu cr0, FZERO, ALPHA bne- cr0, LL(A1I1) - ld FLAG, 48+64+8(SP) + LDLONG FLAG, 48+64+8(SP) cmpwi cr0, FLAG, 1 beq- cr0, LL(A1I1) From 970e48e9e5530ba7d6289708c21e880a485d5e54 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 15:35:21 +0100 Subject: [PATCH 218/244] docs: improve readability of the Build system page This only fixes Markdown syntax, and adds a few headers to bring some structure into the long list of variables that influence the build. It does not add or remove variables. --- docs/build_system.md | 113 ++++++++++++++++++++++++++----------------- 1 file changed, 69 insertions(+), 44 deletions(-) diff --git a/docs/build_system.md b/docs/build_system.md index 3de2205807..872553749d 100644 --- a/docs/build_system.md +++ b/docs/build_system.md @@ -1,7 +1,10 @@ -This page describes the Make-based build, which is the default/authoritative -build method. Note that the OpenBLAS repository also supports building with -CMake (not described here) - that generally works and is tested, however there -may be small differences between the Make and CMake builds. +!!! info "Supported build systems" + + This page describes the Make-based build, which is the + default/authoritative build method. Note that the OpenBLAS repository also + supports building with CMake (not described here) - that generally works + and is tested, however there may be small differences between the Make and + CMake builds. !!! warning This page is made by someone who is not the developer and should not be considered as an official documentation of the build system. For getting the full picture, it is best to read the Makefiles and understand them yourself. @@ -49,56 +52,78 @@ Makefile ## Important Variables -Most of the tunable variables are found in [Makefile.rule](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.rule), along with their detailed descriptions.
-Most of the variables are detected automatically in [Makefile.prebuild](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.prebuild), if they are not set in the environment. +Most of the tunable variables are found in +[Makefile.rule](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.rule), +along with their detailed descriptions. -### CPU related -``` -ARCH - Target architecture (eg. x86_64) -TARGET - Target CPU architecture, in case of DYNAMIC_ARCH=1 means library will not be usable on less capable CPUs -TARGET_CORE - TARGET_CORE will override TARGET internally during each cpu-specific cycle of the build for DYNAMIC_ARCH -DYNAMIC_ARCH - For building library for multiple TARGETs (does not lose any optimizations, but increases library size) -DYNAMIC_LIST - optional user-provided subset of the DYNAMIC_CORE list in Makefile.system -``` +Most of the variables are detected automatically in +[Makefile.prebuild](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.prebuild), +if they are not set in the environment. -### Toolchain related -``` -CC - TARGET C compiler used for compilation (can be cross-toolchains) -FC - TARGET Fortran compiler used for compilation (can be cross-toolchains, set NOFORTRAN=1 if used cross-toolchain has no fortran compiler) -AR, AS, LD, RANLIB - TARGET toolchain helpers used for compilation (can be cross-toolchains) -HOSTCC - compiler of build machine, needed to create proper config files for target architecture -HOST_CFLAGS - flags for build machine compiler -``` +### CPU related -### Library related -``` -BINARY - 32/64 bit library +- `ARCH`: target architecture (e.g., `x86-64`). +- `DYNAMIC_ARCH`: For building library for multiple `TARGET`s (does not lose any + optimizations, but increases library size). +- `DYNAMIC_LIST`: optional user-provided subset of the `DYNAMIC_CORE` list in + [Makefile.system](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.system). +- `TARGET`: target CPU architecture. In case of `DYNAMIC_ARCH=1`, it means that + the library will not be usable on less capable CPUs. +- `TARGET_CORE`: override `TARGET` internally during each CPU-specific cycle of + the build for `DYNAMIC_ARCH`. -BUILD_SHARED - Create shared library -BUILD_STATIC - Create static library -QUAD_PRECISION - enable support for IEEE quad precision [ largely unimplemented leftover from GotoBLAS, do not use ] -EXPRECISION - Obsolete option to use float80 of SSE on BSD-like systems -INTERFACE64 - Build with 64bit integer representations to support large array index values [ incompatible with standard API ] +### Toolchain related -BUILD_SINGLE - build the single-precision real functions of BLAS [and optionally LAPACK] -BUILD_DOUBLE - build the double-precision real functions -BUILD_COMPLEX - build the single-precision complex functions -BUILD_COMPLEX16 - build the double-precision complex functions -(all four types are included in the build by default when none was specifically selected) +- `CC`: `TARGET` C compiler used for compilation (can be cross-toolchains). +- `FC`: `TARGET` Fortran compiler used for compilation (can be cross-toolchains, + set `NOFORTRAN=1` if the used cross-toolchain has no Fortran compiler). +- `AR`, `AS`, `LD`, `RANLIB`: `TARGET` toolchain helpers used for compilation + (can be cross-toolchains). +- `HOSTCC`: compiler of build machine, needed to create proper config files for + the target architecture. +- `HOST_CFLAGS`: flags for the build machine compiler. 
-BUILD_BFLOAT16 - build the "half precision brainfloat" real functions - -USE_THREAD - Use a multithreading backend (default to pthread) -USE_LOCKING - implement locking for thread safety even when USE_THREAD is not set (so that the singlethreaded library can - safely be called from multithreaded programs) -USE_OPENMP - Use OpenMP as multithreading backend -NUM_THREADS - define this to the maximum number of parallel threads you expect to need (defaults to the number of cores in the build cpu) -NUM_PARALLEL - define this to the number of OpenMP instances that your code may use for parallel calls into OpenBLAS (default 1,see below) -``` +### Library related +#### Library kind and bitness options + +- `BINARY`: whether to build a 32-bit or 64-bit library (default is `64`, set + to `32` on a 32-bit platform). +- `BUILD_SHARED`: create a shared library +- `BUILD_STATIC`: create a static library +- `INTERFACE64`: build with 64-bit (ILP64) integer representations to support + large array index values (incompatible with the standard 32-bit integer (LP64) API). + +#### Data type options + +- `BUILD_SINGLE`: build the single-precision real functions of BLAS and (if + it's built) LAPACK +- `BUILD_DOUBLE`: build the double-precision real functions +- `BUILD_COMPLEX`: build the single-precision complex functions +- `BUILD_COMPLEX16`: build the double-precision complex functions +- `BUILD_BFLOAT16`: build the "half precision brainfloat" real functions +- `EXPRECISION`: obsolete option to use float80 of SSE on BSD-like systems +- `QUAD_PRECISION`: enable support for IEEE quad precision (largely + unimplemented leftover from GotoBLAS, do not use) + +By default, the single- and double-precision real and complex floating-point +functions are included in the build, while the half- and extended-precision +functions are not. + +#### Threading options + +- `USE_THREAD`: Use a multithreading backend (defaults to `pthreads`). +- `USE_LOCKING`: implement locking for thread safety even when `USE_THREAD` is + not set (so that the single-threaded library can safely be called from + multithreaded programs). +- `USE_OPENMP`: Use OpenMP as multithreading backend +- `NUM_THREADS`: define this to the maximum number of parallel threads you + expect to need (defaults to the number of cores in the build CPU). +- `NUM_PARALLEL`: define this to the number of OpenMP instances that your code + may use for parallel calls into OpenBLAS (the default is `1`, see below). OpenBLAS uses a fixed set of memory buffers internally, used for communicating and compiling partial results from individual threads. For efficiency, the From d4addc0688b0d12f91b15d6420b5ea966802e8b4 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 16:02:34 +0100 Subject: [PATCH 219/244] docs: improve description of library, data type and toolchain build variables --- docs/build_system.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/build_system.md b/docs/build_system.md index 872553749d..9ceed13656 100644 --- a/docs/build_system.md +++ b/docs/build_system.md @@ -79,6 +79,13 @@ if they are not set in the environment. - `CC`: `TARGET` C compiler used for compilation (can be cross-toolchains). - `FC`: `TARGET` Fortran compiler used for compilation (can be cross-toolchains, set `NOFORTRAN=1` if the used cross-toolchain has no Fortran compiler). 
+- `COMMON_OPT`: flags to add to all invocations of the target C and Fortran compilers + (overrides `CFLAGS`/`FFLAGS` - prefer using `COMMON_OPT`) +- `CCOMMON_OPT`: flags to add to all invocations of the target C compiler + (overrides `CFLAGS`) +- `FCOMMON_OPT`: flags to add to all invocations of the target Fortran compiler + (overrides `FFLAGS`) +- `LDFLAGS`: flags to add to all target linker invocations - `AR`, `AS`, `LD`, `RANLIB`: `TARGET` toolchain helpers used for compilation (can be cross-toolchains). - `HOSTCC`: compiler of build machine, needed to create proper config files for @@ -92,11 +99,13 @@ if they are not set in the environment. - `BINARY`: whether to build a 32-bit or 64-bit library (default is `64`, set to `32` on a 32-bit platform). -- `BUILD_SHARED`: create a shared library -- `BUILD_STATIC`: create a static library - `INTERFACE64`: build with 64-bit (ILP64) integer representations to support large array index values (incompatible with the standard 32-bit integer (LP64) API). +Note that both shared and static libraries will be built with the Make-based +build. The CMake build provides `BUILD_SHARED_LIBS`/`BUILD_STATIC_LIBS` +variables to allow building only one of the two. + #### Data type options - `BUILD_SINGLE`: build the single-precision real functions of BLAS and (if @@ -105,9 +114,8 @@ if they are not set in the environment. - `BUILD_COMPLEX`: build the single-precision complex functions - `BUILD_COMPLEX16`: build the double-precision complex functions - `BUILD_BFLOAT16`: build the "half precision brainfloat" real functions -- `EXPRECISION`: obsolete option to use float80 of SSE on BSD-like systems -- `QUAD_PRECISION`: enable support for IEEE quad precision (largely - unimplemented leftover from GotoBLAS, do not use) +- `EXPRECISION`: (do not use, this is a work in progress) option to use `long + double` functions By default, the single- and double-precision real and complex floating-point functions are included in the build, while the half- and extended-precision From c526b10b6897bfa7099e9e00060fb35a1bbbc3b5 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 16:18:26 +0100 Subject: [PATCH 220/244] docs: add library and symbol name build variables --- docs/build_system.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/build_system.md b/docs/build_system.md index 9ceed13656..aa5d1fe121 100644 --- a/docs/build_system.md +++ b/docs/build_system.md @@ -151,3 +151,17 @@ same time, then only one of them will be able to make progress while all the rest of them spin-wait for the one available buffer. Setting `NUM_PARALLEL` to the upper bound on the number of OpenMP runtimes that you can have in a process ensures that there are a sufficient number of buffer sets available. 
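To make the threading controls above concrete, a minimal C caller can query and override them at run time. This is a sketch only — it assumes an OpenBLAS build whose `cblas.h` exposes the standard `openblas_*` extension functions:

```c
#include <stdio.h>
#include <cblas.h>

int main(void) {
    /* Thread count chosen at startup: capped by NUM_THREADS at build time,
       adjustable via OPENBLAS_NUM_THREADS / OMP_NUM_THREADS at run time. */
    printf("threads:  %d\n", openblas_get_num_threads());

    /* Threading backend: 0 = sequential, 1 = pthreads, 2 = OpenMP. */
    printf("parallel: %d\n", openblas_get_parallel());

    /* Override the thread count for subsequent BLAS calls. */
    openblas_set_num_threads(4);
    return 0;
}
```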
+ +#### Library and symbol name options + +- `FIXED_LIBNAME`: if set to `1`, uses a non-versioned name for the library and + no symbolic linking to variant names (default is `0`) +- `LIBNAMEPREFIX`: prefix that, if given, will be inserted in the library name + before `openblas` (e.g., `xxx` will result in `libxxxopenblas.so`) +- `LIBNAMESUFFIX`: suffix that, if given, will be inserted in the library name + after `openblas`, separated by an underscore (e.g., `yyy` will result in + `libopenblas_yyy.so`) +- `SYMBOLPREFIX`: prefix that, if given, will be added to all symbol names + *and* to the library name +- `SYMBOLSUFFIX`: suffix that, if given, will be added to all symbol names + *and* to the library name From ed114150d13a2e3203fb0bffc8587330d33896a7 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 16:28:31 +0100 Subject: [PATCH 221/244] docs: add the build variables for BLAS/LAPACK functionality --- docs/build_system.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/docs/build_system.md b/docs/build_system.md index aa5d1fe121..c8b8f36ea3 100644 --- a/docs/build_system.md +++ b/docs/build_system.md @@ -60,6 +60,8 @@ Most of the variables are detected automatically in [Makefile.prebuild](https://github.com/xianyi/OpenBLAS/blob/develop/Makefile.prebuild), if they are not set in the environment. +The most commonly used variables are documented below. There are more options +though - please read the linked Makefiles if you want to see all variables. ### CPU related @@ -101,10 +103,8 @@ if they are not set in the environment. to `32` on a 32-bit platform). - `INTERFACE64`: build with 64-bit (ILP64) integer representations to support large array index values (incompatible with the standard 32-bit integer (LP64) API). - -Note that both shared and static libraries will be built with the Make-based -build. The CMake build provides `BUILD_SHARED_LIBS`/`BUILD_STATIC_LIBS` -variables to allow building only one of the two. +- `NO_STATIC`: if set to `1`, don't build a static library (default is `0`) +- `NO_SHARED`: if set to `1`, don't build a shared library (default is `0`) #### Data type options @@ -165,3 +165,18 @@ ensures that there are a sufficient number of buffer sets available. *and* to the library name - `SYMBOLSUFFIX`: suffix that, if given, will be added to all symbol names *and* to the library name + +#### BLAS and LAPACK options + +By default, the Fortran and C interfaces to BLAS and LAPACK are built, +including deprecated functions, while +[ReLAPACK](https://github.com/HPAC/ReLAPACK) is not. + +- `NO_CBLAS`: if set to `1`, don't build the CBLAS interface (default is `0`) +- `ONLY_CBLAS`: if set to `1`, only build the CBLAS interface (default is `0`) +- `NO_LAPACK`: if set to `1`, don't build LAPACK (default is `0`) +- `NO_LAPACKE`: if set to `1`, don't build the LAPACKE interface (default is `0`) +- `BUILD_LAPACK_DEPRECATED`: if set to `0`, don't build deprecated LAPACK + functions (default is `1`) +- `BUILD_RELAPACK`: if set to `1`, build Recursive LAPACK on top of LAPACK + (default is `0`) From 5aa1845a43e2bbe7a4d269de54dac05916eb5613 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 16:55:43 +0100 Subject: [PATCH 222/244] docs: fix two broken links related to MSVC The doc build is now clean of warnings again. 
--- docs/faq.md | 2 +- docs/install.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 1a3505ca90..93d76c67fb 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -99,7 +99,7 @@ Here is the result of the DGEMM subroutine's performance on Intel Core i5-2500K ### How can I call an OpenBLAS function in Microsoft Visual Studio? -Please read [this page](install.md#visual-studio). +Please read [this page](install.md#visual-studio-native-windows-abi). ### How can I use CBLAS and LAPACKE without C99 complex number support (e.g. in Visual Studio)? diff --git a/docs/install.md b/docs/install.md index b7d8a36167..55ebc35c1b 100644 --- a/docs/install.md +++ b/docs/install.md @@ -505,7 +505,7 @@ In your shell, move to this directory: `cd exports`. incompatibility in the C ABI would be a bug). The import libraries of MSVC have the suffix `.lib`. They are generated - from a `.def` file using MSVC's `lib.exe`. See [the MSVC instructions](use_visual_studio.md#generate-import-library-before-0210-version). + from a `.def` file using MSVC's `lib.exe`. === "MinGW" From f764d76a4a0306517727abac4c5ec4f924629666 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 18:10:41 +0100 Subject: [PATCH 223/244] docs: improve the Makefile dependency graph Uses Mermaid to render it as a diagram in the html docs. --- .github/workflows/docs.yml | 2 +- docs/build_system.md | 65 +++++++++++++++----------------------- mkdocs.yml | 7 +++- 3 files changed, 33 insertions(+), 41 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index da40b853f0..391183d1cd 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -23,7 +23,7 @@ jobs: python-version: "3.10" - name: Install MkDocs and doc theme packages - run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin + run: pip install mkdocs mkdocs-material mkdocs-git-revision-date-localized-plugin mkdocs-mermaid2-plugin - name: Build docs site run: mkdocs build diff --git a/docs/build_system.md b/docs/build_system.md index c8b8f36ea3..f26bfb917a 100644 --- a/docs/build_system.md +++ b/docs/build_system.md @@ -9,47 +9,34 @@ !!! warning This page is made by someone who is not the developer and should not be considered as an official documentation of the build system. For getting the full picture, it is best to read the Makefiles and understand them yourself. -## Makefile dep graph - -``` -Makefile -| -|----- Makefile.system # !!! this is included by many of the Makefiles in the subdirectories !!! -| | -| |===== Makefile.prebuild # This is triggered (not included) once by Makefile.system -| | | # and runs before any of the actual library code is built. 
-| | | # (builds and runs the "getarch" tool for cpu identification, -| | | # runs the compiler detection scripts c_check and f_check) -| | | -| | ----- (Makefile.conf) [ either this or Makefile_kernel.conf is generated ] -| | | { Makefile.system#L243 } -| | ----- (Makefile_kernel.conf) [ temporary Makefile.conf during DYNAMIC_ARCH builds ] -| | -| |----- Makefile.rule # defaults for build options that can be given on the make command line -| | -| |----- Makefile.$(ARCH) # architecture-specific compiler options and OpenBLAS buffer size values -| -|~~~~~ exports/ -| -|~~~~~ test/ -| -|~~~~~ utest/ -| -|~~~~~ ctest/ -| -|~~~~~ cpp_thread_test/ -| -|~~~~~ kernel/ -| -|~~~~~ ${SUBDIRS} -| -|~~~~~ ${BLASDIRS} -| -|~~~~~ ${NETLIB_LAPACK_DIR}{,/timing,/testing/{EIG,LIN}} -| -|~~~~~ relapack/ +## Makefile dependency graph + + + +```mermaid +flowchart LR + A[Makefile] -->|included by many of the Makefiles in the subdirectories!| B(Makefile.system) + B -->|triggered, not included, once by Makefile.system, and runs before any of the actual library code is built. builds and runs the 'getarch' tool for cpu identification, runs the compiler detection scripts c_check/f_check| C{Makefile.prebuild} + C -->|either this or Makefile_kernel.conf is generated| D[Makefile.conf] + C -->|temporary Makefile.conf during DYNAMIC_ARCH builds| E[Makefile_kernel.conf] + B -->|defaults for build options that can be given on the make command line| F[Makefile.rule] + B -->|architecture-specific compiler options and OpenBLAS buffer size values| G[Makefile.$ARCH] + A --> exports + A -->|directories: test, ctest, utest, cpp_thread_test| H(test directories) + A --> I($BLASDIRS) + I --> interface + I --> driver/level2 + I --> driver/level3 + I --> driver/others + A -->|for each target in DYNAMIC_CORE if DYNAMIC_ARCH=1| kernel + A -->|subdirs: timing, testing, testing/EIG, testing/LIN| J($NETLIB_LAPACK_DIR) + A --> relapack ``` + ## Important Variables Most of the tunable variables are found in diff --git a/mkdocs.yml b/mkdocs.yml index 374b03e398..6e2b33be22 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -26,13 +26,18 @@ theme: plugins: - search + - mermaid2 - git-revision-date-localized: enable_creation_date: true markdown_extensions: - admonition - pymdownx.details - - pymdownx.superfences + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:mermaid2.fence_mermaid_custom - footnotes - pymdownx.tabbed: alternate_style: true From c0bf48fbf32da2197fa5093f0cc4a30f0b05238f Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 18:13:40 +0100 Subject: [PATCH 224/244] docs: remove warning on the Build system page Content is reviewed fairly carefully, and should be up to the same standard as the rest of the docs now. --- docs/build_system.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/build_system.md b/docs/build_system.md index f26bfb917a..d5d76cc463 100644 --- a/docs/build_system.md +++ b/docs/build_system.md @@ -6,8 +6,6 @@ and is tested, however there may be small differences between the Make and CMake builds. -!!! warning - This page is made by someone who is not the developer and should not be considered as an official documentation of the build system. For getting the full picture, it is best to read the Makefiles and understand them yourself. 
## Makefile dependency graph From 1833e68bee0bc2fee5dcc7f8b45580bd29269606 Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 20:55:39 +0100 Subject: [PATCH 225/244] docs: improve rendering of "Runtime variables" page --- docs/runtime_variables.md | 45 +++++++++++++++++++++++++-------------- mkdocs.yml | 1 + 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/docs/runtime_variables.md b/docs/runtime_variables.md index a43b98cac5..f1ffb791fd 100644 --- a/docs/runtime_variables.md +++ b/docs/runtime_variables.md @@ -1,25 +1,38 @@ -## Runtime variables - OpenBLAS checks the following environment variables on startup: -* **OPENBLAS_NUM_THREADS=** the number of threads to use (for non-OpenMP-builds of OpenBLAS) -* **OMP_NUM_THREADS=** the number of threads to use (for OpenMP builds - note that setting this may also affect any other OpenMP code) -* **OPENBLAS_DEFAULT_NUM_THREADS=** the number of threads to use, irrespective if OpenBLAS was built for OpenMP or pthreads +* `OPENBLAS_NUM_THREADS`: the number of threads to use (for non-OpenMP builds + of OpenBLAS) +* `OMP_NUM_THREADS`: the number of threads to use (for OpenMP builds - note + that setting this may also affect any other OpenMP code) +* `OPENBLAS_DEFAULT_NUM_THREADS`: the number of threads to use, irrespective if + OpenBLAS was built for OpenMP or pthreads + +* `OPENBLAS_MAIN_FREE=1`: this can be used to disable automatic assignment of + cpu affinity in OpenBLAS builds that have it enabled by default +* `OPENBLAS_THREAD_TIMEOUT`: this can be used to define the length of time + that idle threads should wait before exiting +* `OMP_ADAPTIVE=1`: this can be used in OpenMP builds to actually remove any + surplus threads when the number of threads is decreased -* **OPENBLAS_MAIN_FREE=1**" this can be used to disable automatic assignment of cpu affinity in OpenBLAS builds that have it enabled by default -* **OPENBLAS_THREAD_TIMEOUT=** this can be used to define the length of time that idle threads should wait before exiting -* **OMP_ADAPTIVE=1** this can be used in OpenMP builds to actually remove any surplus threads when the number of threads is decreased +`DYNAMIC_ARCH` builds also accept the following: -DYNAMIC_ARCH builds also accept the following: -* **OPENBLAS_VERBOSE=** set this to "1" to enable a warning when there is no exact match for the detected cpu in the library - set this to "2" to make OpenBLAS print the name of the cpu target it autodetected -* **OPENBLAS_CORETYPE=** set this to one of the supported target names to override autodetection, e.g. 
OPENBLAS_CORETYPE=HASWELL -* **OPENBLAS_L2_SIZE=** set this to override the autodetected size of the L2 cache where it is not reported correctly (in virtual environments) +* `OPENBLAS_VERBOSE`: + - set this to `1` to enable a warning when there is no exact match for the + detected cpu in the library + - set this to `2` to make OpenBLAS print the name of the cpu target it + autodetected + +* `OPENBLAS_CORETYPE`: set this to one of the supported target names to + override autodetection, e.g., `OPENBLAS_CORETYPE=HASWELL` +* `OPENBLAS_L2_SIZE`: set this to override the autodetected size of the L2 + cache where it is not reported correctly (in virtual environments) Deprecated variables still recognized for compatibilty: -* **GOTO_NUM_THREADS=** equivalent to **OPENBLAS_NUM_THREADS** -* **GOTOBLAS_MAIN_FREE** equivalent to **OPENBLAS_MAIN_FREE** -* **OPENBLAS_BLOCK_FACTOR** this applies a scale factor to the GEMM "P" parameter of the block matrix code, see file driver/others/parameter.cen + +* `GOTO_NUM_THREADS`: equivalent to `OPENBLAS_NUM_THREADS` +* `GOTOBLAS_MAIN_FREE`: equivalent to `OPENBLAS_MAIN_FREE` +* `OPENBLAS_BLOCK_FACTOR`: this applies a scale factor to the GEMM "P" + parameter of the block matrix code, see file `driver/others/parameter.c` diff --git a/mkdocs.yml b/mkdocs.yml index 6e2b33be22..333344fe30 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -51,6 +51,7 @@ nav: - extensions.md - developers.md - build_system.md + - runtime_variables.md - distributing.md - ci.md - about.md From eda80f436a35491078e226ae6a471c419e8fda7a Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 21:10:43 +0100 Subject: [PATCH 226/244] docs: improve rendering of Windows on Arm instructions --- docs/install.md | 59 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/docs/install.md b/docs/install.md index 55ebc35c1b..7ac85e82d2 100644 --- a/docs/install.md +++ b/docs/install.md @@ -443,28 +443,43 @@ A fully functional native OpenBLAS for WoA that can be built as both a static an (Note that you can use the free "Visual Studio 2022 Community Edition" for this task. In principle it would be possible to build with VisualStudio alone, but using the LLVM toolchain enables native compilation of the Fortran sources of LAPACK and of all the optimized assembly files, which VisualStudio cannot handle on its own) -1. Clone OpenBLAS to your local machine and checkout to latest release of OpenBLAS (unless you want to build the latest development snapshot - here we are using the 0.3.28 release as the example, of course this exact version may be outdated by the time you read this) +1. Clone OpenBLAS to your local machine and checkout to latest release of + OpenBLAS (unless you want to build the latest development snapshot - here we + are using the 0.3.28 release as the example, of course this exact version + may be outdated by the time you read this) - ```cmd - git clone https://github.com/OpenMathLib/OpenBLAS.git - cd OpenBLAS - git checkout v0.3.28 - ``` + ```cmd + git clone https://github.com/OpenMathLib/OpenBLAS.git + cd OpenBLAS + git checkout v0.3.28 + ``` 2. Install Latest LLVM toolchain for WoA: -Download the Latest LLVM toolchain for WoA from [the Release page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). 
At the time of writing, this is version 19.1.5 - be sure to select the latest release for which you can find a precompiled package whose name ends in "-woa64.exe" (precompiled packages -usually lag a week or two behind their corresponding source release). -Make sure to enable the option “Add LLVM to the system PATH for all the users” -Note: Make sure that the path of LLVM toolchain is at the top of Environment Variables section to avoid conflicts between the set of compilers available in the system path + Download the Latest LLVM toolchain for WoA from [the Release + page](https://github.com/llvm/llvm-project/releases/tag/llvmorg-19.1.5). At + the time of writing, this is version 19.1.5 - be sure to select the + latest release for which you can find a precompiled package whose name ends + in "-woa64.exe" (precompiled packages usually lag a week or two behind their + corresponding source release). Make sure to enable the option + *“Add LLVM to the system PATH for all the users”*. + + Note: Make sure that the path of LLVM toolchain is at the top of Environment + Variables section to avoid conflicts between the set of compilers available + in the system path 3. Launch the Native Command Prompt for Windows ARM64: -From the start menu search for “ARM64 Native Tools Command Prompt for Visual Studio 2022” -Alternatively open command prompt, run the following command to activate the environment: -"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat" + From the start menu search for *"ARM64 Native Tools Command Prompt for Visual + Studio 2022"*. Alternatively open command prompt, run the following command to + activate the environment: + + ```cmd + C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsarm64.bat + ``` -Navigate to the OpenBLAS source code directory and start building OpenBLAS by invoking Ninja: +4. Navigate to the OpenBLAS source code directory and start building OpenBLAS + by invoking Ninja: ```cmd cd OpenBLAS @@ -476,14 +491,18 @@ Navigate to the OpenBLAS source code directory and start building OpenBLAS by in ninja -j16 ``` -Note: You might want to include additional options in the cmake command here. For example, the default configuration only generates a static.lib version of the library. If you prefer a DLL, you can add -DBUILD_SHARED_LIBS=ON. - -Note that it is also possible to use the same setup to build OpenBLAS with Make, if you prepare Makefiles over the CMake build for some reason: + Note: You might want to include additional options in the cmake command + here. For example, the default configuration only generates a + `static.lib` version of the library. If you prefer a DLL, you can add + `-DBUILD_SHARED_LIBS=ON`. 
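   A quick way to sanity-check the freshly built library is to link a tiny C
   program against it — a sketch that assumes the OpenBLAS header and import
   library are on your compiler's search paths (`openblas_get_config` and
   `openblas_get_corename` are standard OpenBLAS extensions):

   ```c
   #include <stdio.h>
   #include <cblas.h>

   int main(void) {
       /* Reports the build options compiled in and the detected core. */
       printf("config: %s\n", openblas_get_config());
       printf("core:   %s\n", openblas_get_corename());
       return 0;
   }
   ```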
- ```cmd - $ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make - ``` + Note that it is also possible to use the same setup to build OpenBLAS + with Make, if you prefer Makefiles over the CMake build for some + reason: + ```cmd + $ make CC=clang-cl FC=flang-new AR="llvm-ar" TARGET=ARMV8 ARCH=arm64 RANLIB="llvm-ranlib" MAKE=make + ``` #### Generating an import library From f697cfe0d0023afd96bae6bc1026b0d451e1ce6e Mon Sep 17 00:00:00 2001 From: Ralf Gommers Date: Sat, 4 Jan 2025 21:18:07 +0100 Subject: [PATCH 227/244] docs: improve the rendering of the HarmonyOS build instructions --- docs/install.md | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/docs/install.md b/docs/install.md index 7ac85e82d2..a3174202fb 100644 --- a/docs/install.md +++ b/docs/install.md @@ -711,25 +711,36 @@ to the minimum iOS version you want to target and execute this file to build the ### HarmonyOS -For this target you will need the cross-compiler toolchain package by Huawei, which contains solutions for both Windows and Linux. Only the Linux-based -toolchain has been tested so far, but the following instructions may apply similarly to Windows: - -Download https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz (or whatever newer version may be available in the future). Use tar xvf ohos-sdk-windows_linux_public.tar.gz to unpack it somewhere on your system. This will create a folder named "ohos-sdk" with subfolders "linux" and "windows". In the linux one you will find a ZIP archive named "native-linux-x64-4.1.7.8-Release.zip" - you need to unzip this where you want to -install the cross-compiler, for example in /opt/ohos-sdk. +For this target you will need the cross-compiler toolchain package by Huawei, +which contains solutions for both Windows and Linux. Only the Linux-based +toolchain has been tested so far, but the following instructions may apply +similarly to Windows: + +Download [this HarmonyOS 4.1.1 SDK](https://repo.huaweicloud.com/harmonyos/os/4.1.1-Release/ohos-sdk-windows_linux-public.tar.gz), +or whatever newer version may be available in the future). Use `tar -xvf +ohos-sdk-windows_linux_public.tar.gz` to unpack it somewhere on your system. +This will create a folder named "ohos-sdk" with subfolders "linux" and +"windows". In the linux one you will find a ZIP archive named +`native-linux-x64-4.1.7.8-Release.zip` - you need to unzip this where you want +to install the cross-compiler, for example in `/opt/ohos-sdk`. In the directory where you unpacked OpenBLAS, create a build directory for cmake, and change into it : -``` +```bash mkdir build cd build ``` -Use the version of `cmake` that came with the SDK, and specify the location of its toolchain file as a cmake option. Also set the build target for OpenBLAS to ARMV8 and specify NOFORTRAN=1 (at least as of version 4.1.1, the SDK contains no Fortran compiler): -``` -/opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake -DCMAKE_TOOLCHAIN_FILE=/opt/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \ +Use the version of `cmake` that came with the SDK, and specify the location of +its toolchain file as a cmake option. 
Also set the build target for OpenBLAS to +`ARMV8` and specify `NOFORTRAN=1` (at least as of version 4.1.1, the SDK +contains no Fortran compiler): +```bash +/opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake \ + -DCMAKE_TOOLCHAIN_FILE=/opt/ohos-sdk/linux/native/build/cmake/ohos.toolchain.cmake \ -DOHOS_ARCH="arm64-v8a" -DTARGET=ARMV8 -DNOFORTRAN=1 .. ``` -Additional other OpenBLAS build options like USE_OPENMP=1 or DYNAMIC_ARCH=1 will probably work too. -Finally do the build: -``` +Additional other OpenBLAS build options like `USE_OPENMP=1` or `DYNAMIC_ARCH=1` +will probably work too. Finally do the build: +```bash /opt/ohos-sdk/linux/native/build-tools/cmake/bin/cmake --build . ``` From 8385e02ae15db9e0a0c8f9fd401a65b9d592c020 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 6 Jan 2025 14:42:40 -0800 Subject: [PATCH 228/244] Do not check LDVT when VT is not going to be referenced --- lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c | 2 ++ lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c | 4 +++- lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c | 4 +++- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c index 6bc69d48f0..a41819ccc1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c @@ -74,11 +74,13 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; } - if( ldvt < ncols_vt ) { + if( LAPACKE_lsame( jobvt, 'a' ) || LAPACKE_lsame( jobvt, 's' ) ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_cgesvd_work", info ); return info; } + } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { LAPACK_cgesvd( &jobu, &jobvt, &m, &n, a, &lda_t, s, u, &ldu_t, vt, diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c index 6668dd7484..d79583b539 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c @@ -72,11 +72,13 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; } + if( LAPACKE_lsame( jobvt, 'a' ) || LAPACKE_lsame( jobvt, 's' ) ) { if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_dgesvd_work", info ); return info; } + } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { LAPACK_dgesvd( &jobu, &jobvt, &m, &n, a, &lda_t, s, u, &ldu_t, vt, diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c index c764333ed3..c7561db0b5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c @@ -72,11 +72,13 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; } - if( ldvt < ncols_vt ) { + if( LAPACKE_lsame( jobvt, 'a' ) || LAPACKE_lsame( jobvt, 's' ) ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; } + } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { LAPACK_sgesvd( &jobu, &jobvt, &m, &n, a, &lda_t, s, u, &ldu_t, vt, diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c index ba48bb052f..07e228c803 
100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c @@ -74,11 +74,13 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; } - if( ldvt < ncols_vt ) { + if( LAPACKE_lsame( jobvt, 'a' ) || LAPACKE_lsame( jobvt, 's' ) ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; } + } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { LAPACK_zgesvd( &jobu, &jobvt, &m, &n, a, &lda_t, s, u, &ldu_t, vt, From 05dce05c24ddfe4ab121314b91d625ed32ebc7a0 Mon Sep 17 00:00:00 2001 From: Felix LeClair Date: Tue, 7 Jan 2025 17:10:34 -0500 Subject: [PATCH 229/244] Update Makefile.riscv64 remove fast-math --- Makefile.riscv64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.riscv64 b/Makefile.riscv64 index 9f6e48b7ad..0ee26c1b5c 100644 --- a/Makefile.riscv64 +++ b/Makefile.riscv64 @@ -3,7 +3,7 @@ CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static endif ifeq ($(CORE), x280) -CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math +CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static endif ifeq ($(CORE), RISCV64_ZVL256B) From 0a5dbf13d3b82c07ed98d5a79935274b9fb6b1fd Mon Sep 17 00:00:00 2001 From: "tingbo.liao" Date: Wed, 8 Jan 2025 11:00:35 +0800 Subject: [PATCH 230/244] Optimize the omatcopy_cn and zomatcopy_cn kernels with RVV 1.0 intrinsic. Signed-off-by: tingbo.liao --- kernel/riscv64/KERNEL.x280 | 6 ++ kernel/riscv64/omatcopy_cn_rvv.c | 109 ++++++++++++++++++++++++++++++ kernel/riscv64/zomatcopy_cn_rvv.c | 100 +++++++++++++++++++++++++++ 3 files changed, 215 insertions(+) create mode 100644 kernel/riscv64/omatcopy_cn_rvv.c create mode 100644 kernel/riscv64/zomatcopy_cn_rvv.c diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 86708fe015..e909ca9599 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -279,3 +279,9 @@ endif ifndef ZGEMM_BETA ZGEMM_BETA = zgemm_beta_rvv.c endif + +ZOMATCOPY_CN = zomatcopy_cn_rvv.c +COMATCOPY_CN = zomatcopy_cn_rvv.c + +DOMATCOPY_CN = omatcopy_cn_rvv.c +SOMATCOPY_CN = omatcopy_cn_rvv.c \ No newline at end of file diff --git a/kernel/riscv64/omatcopy_cn_rvv.c b/kernel/riscv64/omatcopy_cn_rvv.c new file mode 100644 index 0000000000..8cd1fb545e --- /dev/null +++ b/kernel/riscv64/omatcopy_cn_rvv.c @@ -0,0 +1,109 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define VSETVL_MAX __riscv_vsetvlmax_e32m8() +#define VSETVL(n) __riscv_vsetvl_e32m8(n) +#define FLOAT_V_T vfloat32m8_t +#define VLEV_FLOAT __riscv_vle32_v_f32m8 +#define VSEV_FLOAT __riscv_vse32_v_f32m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8 +#else +#define VSETVL_MAX __riscv_vsetvlmax_e64m8() +#define VSETVL(n) __riscv_vsetvl_e64m8(n) +#define FLOAT_V_T vfloat64m8_t +#define VLEV_FLOAT __riscv_vle64_v_f64m8 +#define VSEV_FLOAT __riscv_vse64_v_f64m8 +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 +#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8 +#endif + + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j; + FLOAT *aptr,*bptr; + size_t vl; + + FLOAT_V_T va, vb; + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + vl = VSETVL_MAX; + va = VFMVVF_FLOAT(0, vl); + for ( i=0; i + +#if defined(DOUBLE) +#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m4x2 +#define VSSEG2_FLOAT __riscv_vsseg2e64_v_f64m4x2 +#define VSETVL __riscv_vsetvl_e64m4 +#define FLOAT_VX2_T vfloat64m4x2_t +#define VGET_VX2 __riscv_vget_v_f64m4x2_f64m4 +#define VSET_VX2 __riscv_vset_v_f64m4_f64m4x2 +#define FLOAT_V vfloat64m4_t +#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 +#else +#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m4x2 +#define VSSEG2_FLOAT __riscv_vsseg2e32_v_f32m4x2 +#define VSETVL __riscv_vsetvl_e32m4 +#define FLOAT_VX2_T vfloat32m4x2_t +#define VGET_VX2 __riscv_vget_v_f32m4x2_f32m4 +#define VSET_VX2 __riscv_vset_v_f32m4_f32m4x2 +#define FLOAT_V vfloat32m4_t +#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m4 +#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 +#endif + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j,ia; + FLOAT *aptr,*bptr; + size_t vl; + FLOAT_VX2_T va, vb; + FLOAT_V va0, va1, vb0, vb1, vtemp; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + lda *= 2; + ldb *= 2; + + for ( i=0; i Date: Wed, 8 Jan 2025 23:17:45 +0100 Subject: [PATCH 231/244] Replace while loop with for --- kernel/generic/zgemm_beta.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/generic/zgemm_beta.c b/kernel/generic/zgemm_beta.c index 7954e22e3c..bf836aa3e1 100644 --- a/kernel/generic/zgemm_beta.c +++ b/kernel/generic/zgemm_beta.c @@ -93,8 +93,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, } else { - j = n; - do { + + for (j=n;j>0;j++) { c_offset1 = c_offset; 
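/* c_offset1 remembers the start of the current column of C; the next
   statement advances c_offset to the column handled by the following
   iteration of the outer loop. */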
c_offset += ldc; @@ -151,8 +151,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, i --; } while (i > 0); } - j --; - } while (j > 0); + } } return 0; } From 09e75f158897eb0310512b28851c15729a86d427 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jan 2025 00:52:14 +0100 Subject: [PATCH 232/244] fix absurd typo --- kernel/generic/zgemm_beta.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/generic/zgemm_beta.c b/kernel/generic/zgemm_beta.c index bf836aa3e1..1f1ffe1237 100644 --- a/kernel/generic/zgemm_beta.c +++ b/kernel/generic/zgemm_beta.c @@ -94,7 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, } else { - for (j=n;j>0;j++) { + for (j=n;j>0;j--) { c_offset1 = c_offset; c_offset += ldc; From a9eec233d3df79b5a8efaf73b169429f2fa9fa3a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jan 2025 11:34:09 +0100 Subject: [PATCH 233/244] Update FreeBSD jobs to 14.1 --- .cirrus.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 112afe352c..05008af0e1 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -125,9 +125,9 @@ task: - make USE_OPENMP=1 FreeBSD_task: - name: FreeBSD-gcc12 + name: FreeBSD-gcc freebsd_instance: - image_family: freebsd-13-3 + image_family: freebsd-14-1 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -136,9 +136,9 @@ FreeBSD_task: FreeBSD_task: - name: freebsd-gcc12-ilp64 + name: freebsd-gcc-ilp64 freebsd_instance: - image_family: freebsd-13-3 + image_family: freebsd-14-1 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc compile_script: @@ -148,10 +148,10 @@ FreeBSD_task: FreeBSD_task: name: FreeBSD-clang-openmp freebsd_instance: - image_family: freebsd-13-3 + image_family: freebsd-14-1 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc - - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so + - ln -s /usr/local/lib/gcc14/libgfortran.so.5.0.0 /usr/lib/libgfortran.so compile_script: - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1 From b67a963412fe5fc7dce9f8f17c7d51443a07b2b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jan 2025 11:54:35 +0100 Subject: [PATCH 234/244] gcc remains at 13 even for freebsd-14.1 --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 05008af0e1..a7f64255d1 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -151,7 +151,7 @@ FreeBSD_task: image_family: freebsd-14-1 install_script: - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc - - ln -s /usr/local/lib/gcc14/libgfortran.so.5.0.0 /usr/lib/libgfortran.so + - ln -s /usr/local/lib/gcc13/libgfortran.so.5.0.0 /usr/lib/libgfortran.so compile_script: - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1 From d91d4fa6e94ef3f4a50c577b3cb5c191e4bc9e5c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jan 2025 23:11:26 +0100 Subject: [PATCH 235/244] convert the beta=0 branch to a for loop as well --- kernel/generic/zgemm_beta.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/generic/zgemm_beta.c b/kernel/generic/zgemm_beta.c index 1f1ffe1237..61dd207d0c 100644 --- a/kernel/generic/zgemm_beta.c +++ b/kernel/generic/zgemm_beta.c @@ -58,8 +58,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, c_offset = c; if (beta_r == 0. && beta_i == 0.) 
{ - j = n; - do { + + for (j=n;j>0;j--) { c_offset1 = c_offset; c_offset += ldc; @@ -88,8 +88,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, i--; } while (i > 0); } - j --; - } while (j > 0); + } } else { From e0748588b8f83fae8e546d30305f476927f9b8e9 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 10 Jan 2025 11:19:38 +0800 Subject: [PATCH 236/244] LoongArch64: Update dsymv LASX version --- kernel/loongarch64/dsymv_L_lasx.S | 232 ++++++++++++++++-------------- kernel/loongarch64/dsymv_U_lasx.S | 201 ++++++++++++++------------ 2 files changed, 238 insertions(+), 195 deletions(-) diff --git a/kernel/loongarch64/dsymv_L_lasx.S b/kernel/loongarch64/dsymv_L_lasx.S index 2259966d86..a36cff9a93 100644 --- a/kernel/loongarch64/dsymv_L_lasx.S +++ b/kernel/loongarch64/dsymv_L_lasx.S @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" +#include "loongarch64_asm.S" /* Param */ #define M $r4 @@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T2 $r28 #define T3 $r29 #define T4 $r30 +#define T5 $r17 +#define T6 $r16 /* LSX vectors */ #define U0 $xr31 @@ -87,10 +90,113 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define a8 $f8 #define a9 $f9 +.macro LOAD_Y_8 + beqz T5, .L01_Y_0 + add.d T2, IY, INCY + fldx.d $f4, Y, T2 + add.d T2, T2, INCY + fldx.d $f5, Y, T2 + add.d T2, T2, INCY + fldx.d $f6, Y, T2 + add.d T2, T2, INCY + fldx.d $f7, Y, T2 - PROLOGUE + add.d T2, T2, INCY + fldx.d $f8, Y, T2 + add.d T2, T2, INCY + fldx.d $f9, Y, T2 + add.d T2, T2, INCY + fldx.d $f10, Y, T2 + add.d T2, T2, INCY + fldx.d $f11, Y, T2 + + vextrins.d $vr4, $vr5, 0x10 + vextrins.d $vr6, $vr7, 0x10 + xvpermi.q U4, U6, 0x02 - LDARG BUFFER, $sp, 0 + vextrins.d $vr8, $vr9, 0x10 + vextrins.d $vr10, $vr11, 0x10 + xvpermi.q U8, U10, 0x02 + b .L01_Y_1 +.L01_Y_0: + add.d T3, IY, INCY + xvldx U4, Y, T3 + alsl.d T4, INCY, T3, 2 + xvldx U8, Y, T4 +.L01_Y_1: +.endm + +.macro LOAD_X_8 + beqz T6, .L01_X_0 + add.d T2, IX, INCX + fldx.d $f4, X, T2 + add.d T2, T2, INCX + fldx.d $f5, X, T2 + add.d T2, T2, INCX + fldx.d $f6, X, T2 + add.d T2, T2, INCX + fldx.d $f7, X, T2 + + add.d T2, T2, INCX + fldx.d $f8, X, T2 + add.d T2, T2, INCX + fldx.d $f9, X, T2 + add.d T2, T2, INCX + fldx.d $f10, X, T2 + add.d T2, T2, INCX + fldx.d $f11, X, T2 + + vextrins.d $vr4, $vr5, 0x10 + vextrins.d $vr6, $vr7, 0x10 + xvpermi.q U4, U6, 0x02 + + vextrins.d $vr8, $vr9, 0x10 + vextrins.d $vr10, $vr11, 0x10 + xvpermi.q U8, U10, 0x02 + b .L01_X_1 +.L01_X_0: + add.d T3, IX, INCX + xvldx U4, X, T3 + alsl.d T2, INCX, T3, 2 + xvldx U8, X, T2 +.L01_X_1: +.endm + +.macro STORE_Y_8 + beqz T5, .L01_Y_2 + xvpermi.d U6, U4, 0xee + vextrins.d $vr5, $vr4, 0x01 + vextrins.d $vr7, $vr6, 0x01 + + xvpermi.d U10, U8, 0xee + vextrins.d $vr9, $vr8, 0x01 + vextrins.d $vr11, $vr10, 0x01 + + add.d T2, IY, INCY + fstx.d $f4, Y, T2 + add.d T2, T2, INCY + fstx.d $f5, Y, T2 + add.d T2, T2, INCY + fstx.d $f6, Y, T2 + add.d T2, T2, INCY + fstx.d $f7, Y, T2 + + add.d T2, T2, INCY + fstx.d $f8, Y, T2 + add.d T2, T2, INCY + fstx.d $f9, Y, T2 + add.d T2, T2, INCY + fstx.d $f10, Y, T2 + add.d T2, T2, INCY + fstx.d $f11, Y, T2 + b .L01_Y_3 +.L01_Y_2: + xvstx U4, Y, T3 + xvstx U8, Y, T4 +.L01_Y_3: +.endm + + PROLOGUE addi.d $sp, $sp, -88 @@ -107,6 +213,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
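The new LOAD_Y_8 / LOAD_X_8 / STORE_Y_8 macros above dispatch at run time on the stride: T5 = INCY - 1 (and T6 = INCX - 1) is computed before the strides are scaled to bytes, so beqz T5 takes the contiguous path, two 256-bit xvldx/xvstx per 8 doubles, when the stride is 1, and otherwise falls through to the element-by-element gather that was previously executed unconditionally. A minimal C sketch of the same dispatch, assuming a hypothetical vec4d type standing in for one LASX register:

    #include <string.h>

    typedef double vec4d __attribute__((vector_size(32)));

    static void load_y_8(const double *y, long incy, vec4d *lo, vec4d *hi)
    {
        if (incy == 1) {
            /* unit stride: two contiguous 256-bit loads */
            memcpy(lo, y,     sizeof *lo);
            memcpy(hi, y + 4, sizeof *hi);
        } else {
            /* general stride: gather eight elements one at a time */
            for (int k = 0; k < 4; k++) {
                (*lo)[k] = y[k * incy];
                (*hi)[k] = y[(k + 4) * incy];
            }
        }
    }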
xvldrepl.d VALPHA, $sp, 80 + addi.d T5, INCY, -1 + addi.d T6, INCX, -1 slli.d LDA, LDA, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT slli.d INCY, INCY, BASE_SHIFT @@ -122,11 +230,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq J, N, .L999 .L01: - MTC a2, $r0 //temp2 + xvxor.v U2, U2, U2 fldx.d a6, X, JX fmul.d a3, ALPHA, a6 //temp1 xvreplve0.d U3, U3 - xvreplve0.d U2, U2 mul.d T0, J, LDA slli.d T1, J, BASE_SHIFT @@ -147,126 +254,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. srai.d T0, T0, 3 add.d T0, T0, J addi.d T0, T0, 1 - beq I, T0, .L03 - bge I, T0, .L03 + beq I, T0, .L03 + bge I, T0, .L03 mul.d T1, J, LDA add.d T1, T1, II .L02: /* /8 */ xvldx U1, AO1, T1 - addi.d T1, T1, 32 - xvldx U14, AO1, T1 - addi.d T1, T1, 32 + addi.d T2, T1, 32 + xvldx U14, AO1, T2 - add.d T2, IY, INCY - fldx.d $f4, Y, T2 - add.d T2, T2, INCY - fldx.d $f5, Y, T2 - add.d T2, T2, INCY - fldx.d $f6, Y, T2 - add.d T2, T2, INCY - fldx.d $f7, Y, T2 - - add.d T2, T2, INCY - fldx.d $f8, Y, T2 - add.d T2, T2, INCY - fldx.d $f9, Y, T2 - add.d T2, T2, INCY - fldx.d $f10, Y, T2 - add.d T2, T2, INCY - fldx.d $f11, Y, T2 - - vextrins.d $vr4, $vr5, 0x10 - vextrins.d $vr6, $vr7, 0x10 - xvpermi.q U4, U6, 0x02 - - vextrins.d $vr8, $vr9, 0x10 - vextrins.d $vr10, $vr11, 0x10 - xvpermi.q U8, U10, 0x02 + LOAD_Y_8 xvfmadd.d U4, U3, U1, U4 xvfmadd.d U8, U3, U14, U8 - xvpermi.d U6, U4, 0xee - vextrins.d $vr5, $vr4, 0x01 - vextrins.d $vr7, $vr6, 0x01 - - xvpermi.d U10, U8, 0xee - vextrins.d $vr9, $vr8, 0x01 - vextrins.d $vr11, $vr10, 0x01 - - add.d T2, IY, INCY - fstx.d $f4, Y, T2 - add.d T2, T2, INCY - fstx.d $f5, Y, T2 - add.d T2, T2, INCY - fstx.d $f6, Y, T2 - add.d T2, T2, INCY - fstx.d $f7, Y, T2 - - add.d T2, T2, INCY - fstx.d $f8, Y, T2 - add.d T2, T2, INCY - fstx.d $f9, Y, T2 - add.d T2, T2, INCY - fstx.d $f10, Y, T2 - add.d T2, T2, INCY - fstx.d $f11, Y, T2 - - slli.d T2, INCY, 3 - add.d IY, IY, T2 - - add.d T2, IX, INCX - fldx.d $f4, X, T2 - add.d T2, T2, INCX - fldx.d $f5, X, T2 - add.d T2, T2, INCX - fldx.d $f6, X, T2 - add.d T2, T2, INCX - fldx.d $f7, X, T2 - - add.d T2, T2, INCX - fldx.d $f8, X, T2 - add.d T2, T2, INCX - fldx.d $f9, X, T2 - add.d T2, T2, INCX - fldx.d $f10, X, T2 - add.d T2, T2, INCX - fldx.d $f11, X, T2 - - vextrins.d $vr4, $vr5, 0x10 - vextrins.d $vr6, $vr7, 0x10 - xvpermi.q U4, U6, 0x02 - - vextrins.d $vr8, $vr9, 0x10 - vextrins.d $vr10, $vr11, 0x10 - xvpermi.q U8, U10, 0x02 - - xvand.v $xr12, $xr2, $xr2 - - xvfmadd.d U2, U1, U4, U2 - xvfsub.d U2, U2, $xr12 - xvfmadd.d U2, U14, U8, U2 + STORE_Y_8 - xvpermi.d U4, U2, 0x01 - xvpermi.d U5, U2, 0x02 - xvpermi.d U6, U2, 0x03 + alsl.d IY, INCY, IY, 3 - fadd.d $f2, $f2, $f4 - fadd.d $f2, $f2, $f5 - fadd.d $f2, $f2, $f6 - fadd.d $f2, $f2, $f12 + LOAD_X_8 - xvreplve0.d U2, U2 + xvfmadd.d U2, U1, U4, U2 + xvfmadd.d U2, U14, U8, U2 - slli.d T2, INCX, 3 - add.d IX, IX, T2 + alsl.d IX, INCX, IX, 3 + addi.d T1, T1, 64 addi.d II, II, 64 addi.d I, I, 1 blt I, T0, .L02 + //Acc U2 + GACC xvf, d, U4, U2 + fmov.d $f2, $f4 .L03: /* &4 */ sub.d T0, M, J addi.d T0, T0, -1 @@ -437,4 +459,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
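The other change in this main loop is where the dot-product accumulator is reduced. The old code collapsed U2 to a scalar on every iteration (xvpermi.d extracts plus a chain of fadd.d, with the xvand/xvfsub dance to re-subtract the running scalar); the new code clears U2 once with xvxor.v, keeps it as a full vector accumulator, and performs a single horizontal add after the loop via the GACC macro from loongarch64_asm.S. A minimal C sketch of the idea, again with an illustrative vec4d type:

    typedef double vec4d __attribute__((vector_size(32)));

    static double dot_accumulate_then_reduce(const double *a, const double *x,
                                             long nblocks)
    {
        vec4d acc = {0.0, 0.0, 0.0, 0.0};

        /* accumulate lane-wise inside the loop */
        for (long i = 0; i < nblocks; i++)
            for (int k = 0; k < 4; k++)
                acc[k] += a[4 * i + k] * x[4 * i + k];

        /* one horizontal reduction after the loop (what GACC provides) */
        return (acc[0] + acc[1]) + (acc[2] + acc[3]);
    }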
addi.d $sp, $sp, 88 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/dsymv_U_lasx.S b/kernel/loongarch64/dsymv_U_lasx.S index 57eb90aaef..892c5ed2fa 100644 --- a/kernel/loongarch64/dsymv_U_lasx.S +++ b/kernel/loongarch64/dsymv_U_lasx.S @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" +#include "loongarch64_asm.S" /* Param */ #define M $r4 @@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T2 $r28 #define T3 $r29 #define T4 $r30 +#define T5 $r17 +#define T6 $r16 /* LSX vectors */ #define U0 $xr31 @@ -87,67 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define a8 $f8 #define a9 $f9 - - PROLOGUE - - LDARG BUFFER, $sp, 0 - - addi.d $sp, $sp, -88 - - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 32 - SDARG $r27, $sp, 40 - SDARG $r28, $sp, 48 - SDARG $r29, $sp, 56 - SDARG $r30, $sp, 64 - SDARG $r31, $sp, 72 - ST ALPHA, $sp, 80 - - xvldrepl.d VALPHA, $sp, 80 - - slli.d LDA, LDA, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - - bge $r0, M, .L999 - bge $r0, N, .L999 - - sub.d M1, M, N - - mul.d JY, M1, INCY - mul.d JX, M1, INCX - - move J, M1 - move AO1, A - - beq J, M, .L999 - -.L01: - MTC $f2, $r0 //temp2 - fldx.d $f6, X, JX - fmul.d $f3, ALPHA, $f6 //temp1 - xvreplve0.d U3, U3 - xvreplve0.d U2, U2 - - move IY, $r0 - move IX, $r0 - move II, $r0 - move I, $r0 - - srai.d T0, J, 3 - beq I, T0, .L03 - - mul.d T1, J, LDA - add.d T1, T1, II - -.L02: /* /8 */ - xvldx U1, AO1, T1 - addi.d T1, T1, 32 - xvldx U14, AO1, T1 - addi.d T1, T1, 32 - +.macro LOAD_Y_8 + beqz T5, .L01_Y_0 fldx.d $f4, Y, IY add.d T2, IY, INCY fldx.d $f5, Y, T2 @@ -167,20 +111,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vextrins.d $vr4, $vr5, 0x10 vextrins.d $vr6, $vr7, 0x10 - xvpermi.q U4, U6, 0x02 + xvpermi.q U4, U6, 0x02 vextrins.d $vr8, $vr9, 0x10 vextrins.d $vr10, $vr11, 0x10 - xvpermi.q U8, U10, 0x02 - - xvfmadd.d U4, U3, U1, U4 - xvfmadd.d U8, U3, U14, U8 - - xvpermi.d U6, U4, 0xee + xvpermi.q U8, U10, 0x02 + b .L01_Y_1 +.L01_Y_0: + xvldx U4, Y, IY + alsl.d T4, INCY, IY, 2 + xvldx U8, Y, T4 +.L01_Y_1: +.endm + +.macro STORE_Y_8 + beqz T5, .L01_Y_2 + xvpermi.d U6, U4, 0xee vextrins.d $vr5, $vr4, 0x01 vextrins.d $vr7, $vr6, 0x01 - xvpermi.d U10, U8, 0xee + xvpermi.d U10, U8, 0xee vextrins.d $vr9, $vr8, 0x01 vextrins.d $vr11, $vr10, 0x01 @@ -200,10 +150,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstx.d $f10, Y, T2 add.d T2, T2, INCY fstx.d $f11, Y, T2 - - slli.d T2, INCY, 3 - add.d IY, IY, T2 - + b .L01_Y_3 +.L01_Y_2: + xvstx U4, Y, IY + xvstx U8, Y, T4 +.L01_Y_3: +.endm + +.macro LOAD_X_8 + beqz T6, .L01_X_0 fldx.d $f4, X, IX add.d T2, IX, INCX fldx.d $f5, X, T2 @@ -223,36 +178,102 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vextrins.d $vr4, $vr5, 0x10 vextrins.d $vr6, $vr7, 0x10 - xvpermi.q U4, U6, 0x02 + xvpermi.q U4, U6, 0x02 vextrins.d $vr8, $vr9, 0x10 vextrins.d $vr10, $vr11, 0x10 - xvpermi.q U8, U10, 0x02 + xvpermi.q U8, U10, 0x02 + b .L01_X_1 +.L01_X_0: + xvldx U4, X, IX + alsl.d T2, INCX, IX, 2 + xvldx U8, X, T2 +.L01_X_1: +.endm - xvand.v $xr12, $xr2, $xr2 + PROLOGUE - xvfmadd.d U2, U1, U4, U2 - xvfsub.d U2, U2, $xr12 - xvfmadd.d U2, U14, U8, U2 + addi.d $sp, $sp, -88 - xvpermi.d U4, U2, 0x01 - xvpermi.d U5, U2, 0x02 - xvpermi.d U6, U2, 0x03 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 - fadd.d $f2, $f2, $f4 - fadd.d $f2, $f2, $f5 - fadd.d $f2, $f2, $f6 - fadd.d $f2, $f2, $f12 + xvldrepl.d VALPHA, $sp, 80 - xvreplve0.d U2, U2 + addi.d T5, INCY, -1 + addi.d T6, INCX, -1 + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT - slli.d T2, INCX, 3 - add.d IX, IX, T2 + bge $r0, M, .L999 + bge $r0, N, .L999 + + sub.d M1, M, N + + mul.d JY, M1, INCY + mul.d JX, M1, INCX + + move J, M1 + move AO1, A + beq J, M, .L999 + +.L01: + xvxor.v U2, U2, U2 + fldx.d $f6, X, JX + fmul.d $f3, ALPHA, $f6 //temp1 + xvreplve0.d U3, U3 + + move IY, $r0 + move IX, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, J, 3 + beq I, T0, .L03 + + mul.d T1, J, LDA + add.d T1, T1, II + +.L02: /* /8 */ + xvldx U1, AO1, T1 + addi.d T2, T1, 32 + xvldx U14, AO1, T2 + + LOAD_Y_8 + + xvfmadd.d U4, U3, U1, U4 + xvfmadd.d U8, U3, U14, U8 + + STORE_Y_8 + + alsl.d IY, INCY, IY, 3 + + LOAD_X_8 + + xvfmadd.d U2, U1, U4, U2 + xvfmadd.d U2, U14, U8, U2 + + alsl.d IX, INCX, IX, 3 + + addi.d T1, T1, 64 addi.d II, II, 64 addi.d I, I, 1 blt I, T0, .L02 + //Acc U2 + GACC xvf, d, U4, U2 + fmov.d $f2, $f4 + .L03: /* &4 */ andi T0, J, 4 beq $r0, T0, .L04 @@ -425,4 +446,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d $sp, $sp, 88 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE From 20a8e48f25ae523db819546be0bf4116dc192b39 Mon Sep 17 00:00:00 2001 From: gxw Date: Fri, 10 Jan 2025 15:53:14 +0800 Subject: [PATCH 237/244] LoongArch64: Update ssymv LASX version --- kernel/loongarch64/ssymv_L_lasx.S | 210 ++++++++++++++++-------------- kernel/loongarch64/ssymv_U_lasx.S | 184 +++++++++++++------------- 2 files changed, 206 insertions(+), 188 deletions(-) diff --git a/kernel/loongarch64/ssymv_L_lasx.S b/kernel/loongarch64/ssymv_L_lasx.S index 980c10fd74..81796883d7 100644 --- a/kernel/loongarch64/ssymv_L_lasx.S +++ b/kernel/loongarch64/ssymv_L_lasx.S @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" +#include "loongarch64_asm.S" /* Param */ #define M $r4 @@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T2 $r28 #define T3 $r29 #define T4 $r30 +#define T5 $r17 +#define T6 $r16 /* LSX vectors */ #define U0 $xr31 @@ -87,75 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
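Throughout these symv updates, the two-instruction index bumps (slli.d of the stride by 3 followed by add.d) are folded into a single alsl.d, which computes rd = (rj << sa) + rk. In C terms, the per-block advance after eight elements is simply the following, where the stride is already in bytes:

    /* two-instruction form vs. the fused shift-add (illustrative) */
    long advance_two_ops(long iy, long incy_bytes)
    {
        long t = incy_bytes << 3;            /* slli.d T2, INCY, 3 */
        return iy + t;                       /* add.d  IY, IY, T2  */
    }

    long advance_alsl(long iy, long incy_bytes)
    {
        return (incy_bytes << 3) + iy;       /* alsl.d IY, INCY, IY, 3 */
    }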
#define a8 $f8 #define a9 $f9 - - PROLOGUE - - LDARG BUFFER, $sp, 0 - - addi.d $sp, $sp, -88 - - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 32 - SDARG $r27, $sp, 40 - SDARG $r28, $sp, 48 - SDARG $r29, $sp, 56 - SDARG $r30, $sp, 64 - SDARG $r31, $sp, 72 - ST ALPHA, $sp, 80 - - xvldrepl.w VALPHA, $sp, 80 - - slli.d LDA, LDA, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - - bge $r0, M, .L999 - bge $r0, N, .L999 - - move J, $r0 - move JY, $r0 - move JX, $r0 - move AO1, A - - beq J, N, .L999 - -.L01: - MTC a2, $r0 //temp2 - fldx.s a6, X, JX - fmul.s a3, ALPHA, a6 //temp1 - xvreplve0.w U3, U3 - xvreplve0.w U2, U2 - - mul.w T0, J, LDA - slli.d T1, J, BASE_SHIFT - add.w T0, T0, T1 - fldx.s a6, AO1, T0 - fldx.s a4, Y, JY - fmadd.s a4, a3, a6, a4 - fstx.s a4, Y, JY - - move IY, JY - move IX, JX - addi.d II, J, 1 - move I, II - slli.d II, II, BASE_SHIFT - - sub.d T0, M, J - addi.d T0, T0, -1 - srai.d T0, T0, 3 - add.d T0, T0, J - addi.d T0, T0, 1 - beq I, T0, .L03 - bge I, T0, .L03 - - mul.w T1, J, LDA - add.d T1, T1, II - -.L02: /* /8 */ - xvldx U1, AO1, T1 - +.macro LOAD_Y_8 + beqz T5, .L01_Y_0 add.d T2, IY, INCY fldx.s $f4, Y, T2 add.d T2, T2, INCY @@ -180,11 +116,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vextrins.w $vr8, $vr9, 0x10 vextrins.w $vr8, $vr10, 0x20 vextrins.w $vr8, $vr11, 0x30 - xvpermi.q U4, U8, 0x02 - - xvfmadd.s U4, U3, U1, U4 - - xvpermi.d U8, U4, 0xee + xvpermi.q U4, U8, 0x02 + b .L01_Y_1 +.L01_Y_0: + add.d T3, IY, INCY + xvldx U4, Y, T3 +.L01_Y_1: +.endm + +.macro STORE_Y_8 + beqz T5, .L01_Y_2 + xvpermi.d U8, U4, 0xee vextrins.w $vr5, $vr4, 0x01 vextrins.w $vr6, $vr4, 0x02 vextrins.w $vr7, $vr4, 0x03 @@ -209,10 +151,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstx.s $f10, Y, T2 add.d T2, T2, INCY fstx.s $f11, Y, T2 - - slli.d T2, INCY, 3 - add.d IY, IY, T2 - + b .L01_Y_3 +.L01_Y_2: + xvstx U4, Y, T3 +.L01_Y_3: +.endm + +.macro LOAD_X_8 + beqz T6, .L01_X_0 add.d T2, IX, INCX fldx.s $f4, X, T2 add.d T2, T2, INCX @@ -238,39 +184,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vextrins.w $vr8, $vr10, 0x20 vextrins.w $vr8, $vr11, 0x30 xvpermi.q U4, U8, 0x02 + b .L01_X_1 +.L01_X_0: + add.d T3, IX, INCX + xvldx U4, X, T3 +.L01_X_1: +.endm + + PROLOGUE - xvand.v $xr12, $xr2, $xr2 + addi.d $sp, $sp, -88 - xvfmadd.s U2, U1, U4, U2 - xvfsub.s U2, U2, $xr12 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 - xvpickve.w U4, U2, 0x01 - xvpickve.w U5, U2, 0x02 - xvpickve.w U6, U2, 0x03 - xvpickve.w U7, U2, 0x04 - xvpickve.w U8, U2, 0x05 - xvpickve.w U9, U2, 0x06 - xvpickve.w U10, U2, 0x07 + xvldrepl.w VALPHA, $sp, 80 - fadd.s $f2, $f2, $f4 - fadd.s $f2, $f2, $f5 - fadd.s $f2, $f2, $f6 - fadd.s $f2, $f2, $f7 - fadd.s $f2, $f2, $f8 - fadd.s $f2, $f2, $f9 - fadd.s $f2, $f2, $f10 - fadd.s $f2, $f2, $f12 + addi.d T5, INCY, -1 + addi.d T6, INCX, -1 + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT - xvreplve0.d U2, U2 + bge $r0, M, .L999 + bge $r0, N, .L999 + + move J, $r0 + move JY, $r0 + move JX, $r0 + move AO1, A - slli.d T2, INCX, 3 - add.d IX, IX, T2 + beq J, N, .L999 + +.L01: + xvxor.v U2, U2, U2 + fldx.s a6, X, JX + fmul.s a3, ALPHA, a6 //temp1 + xvreplve0.w U3, U3 + + mul.w T0, J, LDA + slli.d T1, J, BASE_SHIFT + add.w T0, T0, T1 + fldx.s a6, AO1, T0 + fldx.s a4, Y, JY + fmadd.s a4, a3, a6, a4 + fstx.s a4, Y, JY + + move IY, JY + move IX, JX + addi.d II, J, 1 + move I, II + slli.d II, II, BASE_SHIFT + + sub.d T0, M, J + addi.d T0, T0, -1 + srai.d T0, T0, 3 + add.d T0, T0, J + addi.d T0, T0, 1 + beq I, T0, .L03 + bge I, T0, .L03 + + mul.w T1, J, LDA + add.d T1, T1, II + +.L02: /* /8 */ + xvldx U1, AO1, T1 + + LOAD_Y_8 + + xvfmadd.s U4, U3, U1, U4 + + STORE_Y_8 + + alsl.d IY, INCY, IY, 3 + + LOAD_X_8 + + xvfmadd.s U2, U1, U4, U2 + + alsl.d IX, INCX, IX, 3 addi.d II, II, 32 addi.d T1, T1, 32 addi.d I, I, 1 blt I, T0, .L02 + //Acc U2 + GACC xvf, s, U4, U2 + fmov.d $f2, $f4 + .L03: /* &4 */ sub.d T0, M, J addi.d T0, T0, -1 @@ -433,4 +443,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi.d $sp, $sp, 88 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE diff --git a/kernel/loongarch64/ssymv_U_lasx.S b/kernel/loongarch64/ssymv_U_lasx.S index bd6fd3dd7a..ff68723e1b 100644 --- a/kernel/loongarch64/ssymv_U_lasx.S +++ b/kernel/loongarch64/ssymv_U_lasx.S @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ASSEMBLER #include "common.h" +#include "loongarch64_asm.S" /* Param */ #define M $r4 @@ -57,6 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define T2 $r28 #define T3 $r29 #define T4 $r30 +#define T5 $r17 +#define T6 $r16 /* LSX vectors */ #define U0 $xr31 @@ -87,64 +90,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
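For orientation, the operation behind both the ssymv and dsymv kernels is the BLAS symv update y := alpha*A*x + y in which only one triangle of A is referenced: each column j contributes a scaled copy of itself to y while a dot product for y[j] is accumulated on the side, which is exactly what the U3 (temp1) broadcast and U2 (temp2) accumulator implement above. A minimal single-precision reference for the lower variant with unit strides (illustrative only, no blocking or vectorization):

    static void ssymv_L_ref(long n, float alpha, const float *a, long lda,
                            const float *x, float *y)
    {
        for (long j = 0; j < n; j++) {
            float temp1 = alpha * x[j];   /* scales column j of A into y   */
            float temp2 = 0.0f;           /* accumulates row j of A with x */
            y[j] += temp1 * a[j + j * lda];
            for (long i = j + 1; i < n; i++) {
                y[i]  += temp1 * a[i + j * lda];
                temp2 += a[i + j * lda] * x[i];
            }
            y[j] += alpha * temp2;
        }
    }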
#define a8 $f8 #define a9 $f9 - - PROLOGUE - - LDARG BUFFER, $sp, 0 - - addi.d $sp, $sp, -88 - - SDARG $r23, $sp, 0 - SDARG $r24, $sp, 8 - SDARG $r25, $sp, 16 - SDARG $r26, $sp, 32 - SDARG $r27, $sp, 40 - SDARG $r28, $sp, 48 - SDARG $r29, $sp, 56 - SDARG $r30, $sp, 64 - SDARG $r31, $sp, 72 - ST ALPHA, $sp, 80 - - xvldrepl.w VALPHA, $sp, 80 - - slli.d LDA, LDA, BASE_SHIFT - slli.d INCX, INCX, BASE_SHIFT - slli.d INCY, INCY, BASE_SHIFT - - bge $r0, M, .L999 - bge $r0, N, .L999 - - sub.d M1, M, N - - mul.d JY, M1, INCY - mul.d JX, M1, INCX - - move J, M1 - move AO1, A - - beq J, M, .L999 - -.L01: - MTC $f2, $r0 //temp2 - fldx.s $f6, X, JX - fmul.s $f3, ALPHA, $f6 //temp1 - xvreplve0.w U3, U3 - xvreplve0.w U2, U2 - - move IY, $r0 - move IX, $r0 - move II, $r0 - move I, $r0 - - srai.d T0, J, 3 - beq I, T0, .L03 - - mul.w T1, J, LDA - add.d T1, T1, II - -.L02: /* /8 */ - xvldx U1, AO1, T1 - +.macro LOAD_Y_8 + beqz T5, .L01_Y_0 fldx.s $f4, Y, IY add.d T2, IY, INCY fldx.s $f5, Y, T2 @@ -168,10 +115,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. vextrins.w $vr8, $vr9, 0x10 vextrins.w $vr8, $vr10, 0x20 vextrins.w $vr8, $vr11, 0x30 - xvpermi.q U4, U8, 0x02 - - xvfmadd.s U4, U3, U1, U4 - + xvpermi.q U4, U8, 0x02 + b .L01_Y_1 +.L01_Y_0: + xvldx U4, Y, IY +.L01_Y_1: +.endm + +.macro STORE_Y_8 + beqz T5, .L01_Y_2 xvpermi.d U8, U4, 0xee vextrins.w $vr5, $vr4, 0x01 vextrins.w $vr6, $vr4, 0x02 @@ -196,10 +148,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fstx.s $f10, Y, T2 add.d T2, T2, INCY fstx.s $f11, Y, T2 - - slli.d T2, INCY, 3 - add.d IY, IY, T2 - + b .L01_Y_3 +.L01_Y_2: + xvstx U4, Y, IY +.L01_Y_3: +.endm + +.macro LOAD_X_8 + beqz T6, .L01_X_0 fldx.s $f4, X, IX add.d T2, IX, INCX fldx.s $f5, X, T2 @@ -224,39 +180,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
vextrins.w $vr8, $vr10, 0x20 vextrins.w $vr8, $vr11, 0x30 xvpermi.q U4, U8, 0x02 + b .L01_X_1 +.L01_X_0: + xvldx U4, X, IX +.L01_X_1: +.endm + + PROLOGUE - xvand.v $xr12, $xr2, $xr2 + addi.d $sp, $sp, -88 - xvfmadd.s U2, U1, U4, U2 - xvfsub.s U2, U2, $xr12 + SDARG $r23, $sp, 0 + SDARG $r24, $sp, 8 + SDARG $r25, $sp, 16 + SDARG $r26, $sp, 32 + SDARG $r27, $sp, 40 + SDARG $r28, $sp, 48 + SDARG $r29, $sp, 56 + SDARG $r30, $sp, 64 + SDARG $r31, $sp, 72 + ST ALPHA, $sp, 80 - xvpickve.w U4, U2, 0x01 - xvpickve.w U5, U2, 0x02 - xvpickve.w U6, U2, 0x03 - xvpickve.w U7, U2, 0x04 - xvpickve.w U8, U2, 0x05 - xvpickve.w U9, U2, 0x06 - xvpickve.w U10, U2, 0x07 + xvldrepl.w VALPHA, $sp, 80 - fadd.s $f2, $f2, $f4 - fadd.s $f2, $f2, $f5 - fadd.s $f2, $f2, $f6 - fadd.s $f2, $f2, $f7 - fadd.s $f2, $f2, $f8 - fadd.s $f2, $f2, $f9 - fadd.s $f2, $f2, $f10 - fadd.s $f2, $f2, $f12 + addi.d T5, INCY, -1 + addi.d T6, INCX, -1 + slli.d LDA, LDA, BASE_SHIFT + slli.d INCX, INCX, BASE_SHIFT + slli.d INCY, INCY, BASE_SHIFT - xvreplve0.d U2, U2 + bge $r0, M, .L999 + bge $r0, N, .L999 + + sub.d M1, M, N + + mul.d JY, M1, INCY + mul.d JX, M1, INCX + + move J, M1 + move AO1, A + + beq J, M, .L999 + +.L01: + xvxor.v U2, U2, U2 + fldx.s $f6, X, JX + fmul.s $f3, ALPHA, $f6 //temp1 + xvreplve0.w U3, U3 - slli.d T2, INCX, 3 - add.d IX, IX, T2 + move IY, $r0 + move IX, $r0 + move II, $r0 + move I, $r0 + + srai.d T0, J, 3 + beq I, T0, .L03 + + mul.w T1, J, LDA + add.d T1, T1, II + +.L02: /* /8 */ + xvldx U1, AO1, T1 + + LOAD_Y_8 + + xvfmadd.s U4, U3, U1, U4 + + STORE_Y_8 + + alsl.d IY, INCY, IY, 3 + + LOAD_X_8 + + xvfmadd.s U2, U1, U4, U2 + + alsl.d IX, INCX, IX, 3 addi.d II, II, 32 addi.d T1, T1, 32 addi.d I, I, 1 blt I, T0, .L02 + //Acc U2 + GACC xvf, s, U4, U2 + fmov.d $f2, $f4 + .L03: /* &4 */ andi T0, J, 4 beq $r0, T0, .L04 @@ -421,4 +429,4 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi.d $sp, $sp, 88 jirl $r0, $r1, 0x0 - EPILOGUE \ No newline at end of file + EPILOGUE From 4c1a23673a13d6b696262e31710ed75326bfc1d6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Jan 2025 11:40:31 -0800 Subject: [PATCH 238/244] Remove comparison that is always false (Reference-LAPACK PR 1062) --- lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c | 4 ++-- lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c index e01664bdf8..cb80787a85 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctpmqrt_work.c @@ -51,8 +51,8 @@ lapack_int LAPACKE_ctpmqrt_work( int matrix_layout, char side, char trans, } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int nrowsA, ncolsA, nrowsV; - if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } - else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + if ( LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } else { info = -2; LAPACKE_xerbla( "LAPACKE_ctpmqrt_work", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c index 366acd3690..1278a8128d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtpmqrt_work.c @@ -49,8 +49,8 @@ lapack_int LAPACKE_dtpmqrt_work( int matrix_layout, char side, char trans, } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int nrowsA, ncolsA, nrowsV; - if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } - else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + if ( LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } else { info = -2; LAPACKE_xerbla( "LAPACKE_dtpmqrt_work", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c index c5a3a14965..d055223f5b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_stpmqrt_work.c @@ -49,8 +49,8 @@ lapack_int LAPACKE_stpmqrt_work( int matrix_layout, char side, char trans, } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int nrowsA, ncolsA, nrowsV; - if ( side == LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } - else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + if ( LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } else { info = -2; LAPACKE_xerbla( "LAPACKE_stpmqrt_work", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c index 104efa8f3c..0cacc665db 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztpmqrt_work.c @@ -51,8 +51,8 @@ lapack_int LAPACKE_ztpmqrt_work( int matrix_layout, char side, char trans, } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int nrowsA, ncolsA, nrowsV; - if ( side == LAPACKE_lsame(side, 'l') ) { 
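The comparison removed in this patch could never hold: LAPACKE_lsame returns a lapack_logical (1 or 0), while side holds a character code such as 'L' (76) or 'R' (82), so for any valid side both tests failed and the row-major path always fell through to the info = -2 error branch. A self-contained demonstration, using a stand-in lsame rather than the real LAPACKE helper:

    #include <stdio.h>
    #include <ctype.h>

    /* stand-in for LAPACKE_lsame: case-insensitive character match */
    static int lsame(char a, char b)
    {
        return tolower((unsigned char)a) == tolower((unsigned char)b);
    }

    int main(void)
    {
        char side = 'L';
        /* the removed form: compares 76 against 1, always 0 */
        printf("side == lsame(side,'l') -> %d\n", side == lsame(side, 'l'));
        /* the fixed form: tests the character itself, prints 1 */
        printf("lsame(side,'l')         -> %d\n", lsame(side, 'l'));
        return 0;
    }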
nrowsA = k; ncolsA = n; nrowsV = m; } - else if ( side == LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } + if ( LAPACKE_lsame(side, 'l') ) { nrowsA = k; ncolsA = n; nrowsV = m; } + else if ( LAPACKE_lsame(side, 'r') ) { nrowsA = m; ncolsA = k; nrowsV = n; } else { info = -2; LAPACKE_xerbla( "LAPACKE_ztpmqrt_work", info ); From ed516994d624e63167d57da31bf5b25ac5e6e008 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 11 Jan 2025 15:37:32 -0800 Subject: [PATCH 239/244] replace ?larft with a recursive implementation (Reference-LAPACK PR 1080) --- lapack-netlib/SRC/clarft.f | 596 ++++++++++++++++++++++++++++--------- lapack-netlib/SRC/dlarft.f | 585 +++++++++++++++++++++++++++--------- lapack-netlib/SRC/slarft.f | 583 +++++++++++++++++++++++++++--------- lapack-netlib/SRC/zlarft.f | 596 ++++++++++++++++++++++++++++--------- 4 files changed, 1792 insertions(+), 568 deletions(-) diff --git a/lapack-netlib/SRC/clarft.f b/lapack-netlib/SRC/clarft.f index fdf80b78e9..de8b97bf9c 100644 --- a/lapack-netlib/SRC/clarft.f +++ b/lapack-netlib/SRC/clarft.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) * * .. Scalar Arguments .. * CHARACTER DIRECT, STOREV @@ -130,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup complexOTHERauxiliary +*> \ingroup larft * *> \par Further Details: * ===================== @@ -159,167 +159,473 @@ *> \endverbatim *> * ===================================================================== - SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) + RECURSIVE SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, + $ TAU, T, LDT ) * * -- LAPACK auxiliary routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * -* .. Scalar Arguments .. - CHARACTER DIRECT, STOREV - INTEGER K, LDT, LDV, N +* .. Scalar Arguments +* + CHARACTER DIRECT, STOREV + INTEGER K, LDT, LDV, N * .. * .. Array Arguments .. - COMPLEX T( LDT, * ), TAU( * ), V( LDV, * ) -* .. * -* ===================================================================== + COMPLEX T( LDT, * ), TAU( * ), V( LDV, * ) +* .. * * .. Parameters .. - COMPLEX ONE, ZERO - PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), - $ ZERO = ( 0.0E+0, 0.0E+0 ) ) -* .. +* + COMPLEX ONE, NEG_ONE, ZERO + PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE=-1.0E+0) +* * .. Local Scalars .. - INTEGER I, J, PREVLASTV, LASTV -* .. +* + INTEGER I,J,L + LOGICAL QR,LQ,QL,DIRF,COLV +* * .. External Subroutines .. - EXTERNAL CGEMM, CGEMV, CTRMV -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME +* + EXTERNAL CTRMM,CGEMM,CLACPY +* +* .. External Functions.. +* + LOGICAL LSAME + EXTERNAL LSAME +* +* .. Intrinsic Functions.. +* + INTRINSIC CONJG +* +* The general scheme used is inspired by the approach inside DGEQRT3 +* which was (at the time of writing this code): +* Based on the algorithm of Elmroth and Gustavson, +* IBM J. Res. Develop. Vol 44 No. 4 July 2000. * .. * .. Executable Statements .. * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN -* - IF( LSAME( DIRECT, 'F' ) ) THEN - PREVLASTV = N - DO I = 1, K - PREVLASTV = MAX( PREVLASTV, I ) - IF( TAU( I ).EQ.ZERO ) THEN -* -* H(i) = I -* - DO J = 1, I - T( J, I ) = ZERO - END DO - ELSE -* -* general case -* - IF( LSAME( STOREV, 'C' ) ) THEN -* Skip any trailing zeros. 
- DO LASTV = N, I+1, -1 - IF( V( LASTV, I ).NE.ZERO ) EXIT - END DO - DO J = 1, I-1 - T( J, I ) = -TAU( I ) * CONJG( V( I , J ) ) - END DO - J = MIN( LASTV, PREVLASTV ) -* -* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**H * V(i:j,i) -* - CALL CGEMV( 'Conjugate transpose', J-I, I-1, - $ -TAU( I ), V( I+1, 1 ), LDV, - $ V( I+1, I ), 1, - $ ONE, T( 1, I ), 1 ) - ELSE -* Skip any trailing zeros. - DO LASTV = N, I+1, -1 - IF( V( I, LASTV ).NE.ZERO ) EXIT - END DO - DO J = 1, I-1 - T( J, I ) = -TAU( I ) * V( J , I ) - END DO - J = MIN( LASTV, PREVLASTV ) -* -* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**H -* - CALL CGEMM( 'N', 'C', I-1, 1, J-I, -TAU( I ), - $ V( 1, I+1 ), LDV, V( I, I+1 ), LDV, - $ ONE, T( 1, I ), LDT ) - END IF -* -* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) -* - CALL CTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, T, - $ LDT, T( 1, I ), 1 ) - T( I, I ) = TAU( I ) - IF( I.GT.1 ) THEN - PREVLASTV = MAX( PREVLASTV, LASTV ) - ELSE - PREVLASTV = LASTV - END IF - END IF + IF(N.EQ.0.OR.K.EQ.0) THEN + RETURN + END IF +* +* Base case +* + IF(N.EQ.1.OR.K.EQ.1) THEN + T(1,1) = TAU(1) + RETURN + END IF +* +* Beginning of executable statements +* + L = K / 2 +* +* Determine what kind of Q we need to compute +* We assume that if the user doesn't provide 'F' for DIRECT, +* then they meant to provide 'B' and if they don't provide +* 'C' for STOREV, then they meant to provide 'R' +* + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') +* +* QR happens when we have forward direction in column storage +* + QR = DIRF.AND.COLV +* +* LQ happens when we have forward direction in row storage +* + LQ = DIRF.AND.(.NOT.COLV) +* +* QL happens when we have backward direction in column storage +* + QL = (.NOT.DIRF).AND.COLV +* +* The last case is RQ. Due to how we structured this, if the +* above 3 are false, then RQ must be true, so we never store +* this +* RQ happens when we have backward direction in row storage +* RQ = (.NOT.DIRF).AND.(.NOT.COLV) +* + IF(QR) THEN +* +* Break V apart into 6 components +* +* V = |---------------| +* |V_{1,1} 0 | +* |V_{2,1} V_{2,2}| +* |V_{3,1} V_{3,2}| +* |---------------| +* +* V_{1,1}\in\C^{l,l} unit lower triangular +* V_{2,1}\in\C^{k-l,l} rectangular +* V_{3,1}\in\C^{n-k,l} rectangular +* +* V_{2,2}\in\C^{k-l,k-l} unit lower triangular +* V_{3,2}\in\C^{n-k,k-l} rectangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* |0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{l, l} upper triangular +* T_{2,2}\in\C^{k-l, k-l} upper triangular +* T_{1,2}\in\C^{l, k-l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2') +* = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2' +* +* Define T{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} +* +* Then, we can define the matrix V as +* V = |-------| +* |V_1 V_2| +* |-------| +* +* So, our product is equivalent to the matrix product +* I - V*T*V' +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL CLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, + $ TAU(L+1), T(L+1, L+1), LDT) +* +* Compute T_{1,2} +* T_{1,2} = V_{2,1}' +* + DO J = 1, L + DO I = 1, K-L + T(J, L+I) = CONJG(V(L+I, J)) + END DO END DO - ELSE - PREVLASTV = 1 - DO I = K, 1, -1 - IF( TAU( I ).EQ.ZERO ) THEN -* -* H(i) = I -* - DO J = I, K - T( J, I ) = ZERO - END DO - ELSE -* -* general case -* - IF( I.LT.K ) THEN - IF( LSAME( STOREV, 'C' ) ) THEN -* Skip any leading zeros. - DO LASTV = 1, I-1 - IF( V( LASTV, I ).NE.ZERO ) EXIT - END DO - DO J = I+1, K - T( J, I ) = -TAU( I ) * CONJG( V( N-K+I , J ) ) - END DO - J = MAX( LASTV, PREVLASTV ) -* -* T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**H * V(j:n-k+i,i) -* - CALL CGEMV( 'Conjugate transpose', N-K+I-J, K-I, - $ -TAU( I ), V( J, I+1 ), LDV, V( J, I ), - $ 1, ONE, T( I+1, I ), 1 ) - ELSE -* Skip any leading zeros. - DO LASTV = 1, I-1 - IF( V( I, LASTV ).NE.ZERO ) EXIT - END DO - DO J = I+1, K - T( J, I ) = -TAU( I ) * V( J, N-K+I ) - END DO - J = MAX( LASTV, PREVLASTV ) -* -* T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**H -* - CALL CGEMM( 'N', 'C', K-I, 1, N-K+I-J, -TAU( I ), - $ V( I+1, J ), LDV, V( I, J ), LDV, - $ ONE, T( I+1, I ), LDT ) - END IF -* -* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) -* - CALL CTRMV( 'Lower', 'No transpose', 'Non-unit', K-I, - $ T( I+1, I+1 ), LDT, T( I+1, I ), 1 ) - IF( I.GT.1 ) THEN - PREVLASTV = MIN( PREVLASTV, LASTV ) - ELSE - PREVLASTV = LASTV - END IF - END IF - T( I, I ) = TAU( I ) - END IF +* +* T_{1,2} = T_{1,2}*V_{2,2} +* + CALL CTRMM('Right', 'Lower', 'No transpose', 'Unit', L, + $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + +* +* T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL CGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, + $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, T(1, L+1), + $ LDT) +* +* At this point, we have that T_{1,2} = V_1'*V_2 +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. 
+* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL CTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL CTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + + ELSE IF(LQ) THEN +* +* Break V apart into 6 components +* +* V = |----------------------| +* |V_{1,1} V_{1,2} V{1,3}| +* |0 V_{2,2} V{2,3}| +* |----------------------| +* +* V_{1,1}\in\C^{l,l} unit upper triangular +* V_{1,2}\in\C^{l,k-l} rectangular +* V_{1,3}\in\C^{l,n-k} rectangular +* +* V_{2,2}\in\C^{k-l,k-l} unit upper triangular +* V_{2,3}\in\C^{k-l,n-k} rectangular +* +* Where l = floor(k/2) +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* |0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{l, l} upper triangular +* T_{2,2}\in\C^{k-l, k-l} upper triangular +* T_{1,2}\in\C^{l, k-l} rectangular +* +* Then, consider the product: +* +* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) +* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 +* +* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL CLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, + $ TAU(L+1), T(L+1, L+1), LDT) + +* +* Compute T_{1,2} +* T_{1,2} = V_{1,2} +* + CALL CLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT) +* +* T_{1,2} = T_{1,2}*V_{2,2}' +* + CALL CTRMM('Right', 'Upper', 'Conjugate', 'Unit', L, K-L, + $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + +* +* T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL CGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE, + $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, T(1, L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1*V_2' +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. +* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL CTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL CTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1,L+1), LDT, T(1, L+1), LDT) + ELSE IF(QL) THEN +* +* Break V apart into 6 components +* +* V = |---------------| +* |V_{1,1} V_{1,2}| +* |V_{2,1} V_{2,2}| +* |0 V_{3,2}| +* |---------------| +* +* V_{1,1}\in\C^{n-k,k-l} rectangular +* V_{2,1}\in\C^{k-l,k-l} unit upper triangular +* +* V_{1,2}\in\C^{n-k,l} rectangular +* V_{2,2}\in\C^{k-l,l} rectangular +* V_{3,2}\in\C^{l,l} unit upper triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{k-l, k-l} non-unit lower triangular +* T_{2,2}\in\C^{l, l} non-unit lower triangular +* T_{2,1}\in\C^{k-l, l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1') +* = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1' +* +* Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1} +* +* Then, we can define the matrix V as +* V = |-------| +* |V_1 V_2| +* |-------| +* +* So, our product is equivalent to the matrix product +* I - V*T*V' +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{2,1} +* +* Compute T_{1,1} recursively +* + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL CLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV, + $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) +* +* Compute T_{2,1} +* T_{2,1} = V_{2,2}' +* + DO J = 1, K-L + DO I = 1, L + T(K-L+I, J) = CONJG(V(N-K+J, K-L+I)) + END DO END DO - END IF - RETURN * -* End of CLARFT +* T_{2,1} = T_{2,1}*V_{2,1} +* + CALL CTRMM('Right', 'Upper', 'No transpose', 'Unit', L, + $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT) + +* +* T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL CGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, + $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1), + $ LDT) +* +* At this point, we have that T_{2,1} = V_2'*V_1 +* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} +* respectively. +* +* T_{2,1} = -T_{2,2}*T_{2,1} +* + CALL CTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, + $ T(K-L+1, 1), LDT) * - END +* T_{2,1} = T_{2,1}*T_{1,1} +* + CALL CTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) + ELSE +* +* Else means RQ case +* +* Break V apart into 6 components +* +* V = |-----------------------| +* |V_{1,1} V_{1,2} 0 | +* |V_{2,1} V_{2,2} V_{2,3}| +* |-----------------------| +* +* V_{1,1}\in\C^{k-l,n-k} rectangular +* V_{1,2}\in\C^{k-l,k-l} unit lower triangular +* +* V_{2,1}\in\C^{l,n-k} rectangular +* V_{2,2}\in\C^{l,k-l} rectangular +* V_{2,3}\in\C^{l,l} unit lower triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{k-l, k-l} non-unit lower triangular +* T_{2,2}\in\C^{l, l} non-unit lower triangular +* T_{2,1}\in\C^{k-l, l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) +* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 +* +* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{2,1} +* +* Compute T_{1,1} recursively +* + CALL CLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL CLARFT(DIRECT, STOREV, N, L, V(K-L+1,1), LDV, + $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) +* +* Compute T_{2,1} +* T_{2,1} = V_{2,2} +* + CALL CLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV, + $ T(K-L+1, 1), LDT) + +* +* T_{2,1} = T_{2,1}*V_{1,2}' +* + CALL CTRMM('Right', 'Lower', 'Conjugate', 'Unit', L, K-L, + $ ONE, V(1, N-K+1), LDV, T(K-L+1,1), LDT) + +* +* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL CGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE, + $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1), + $ LDT) + +* +* At this point, we have that T_{2,1} = V_2*V_1' +* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} +* respectively. +* +* T_{2,1} = -T_{2,2}*T_{2,1} +* + CALL CTRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L, + $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, + $ T(K-L+1, 1), LDT) + +* +* T_{2,1} = T_{2,1}*T_{1,1} +* + CALL CTRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L, + $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) + END IF + END SUBROUTINE diff --git a/lapack-netlib/SRC/dlarft.f b/lapack-netlib/SRC/dlarft.f index a8d9de61f1..c27bb1a806 100644 --- a/lapack-netlib/SRC/dlarft.f +++ b/lapack-netlib/SRC/dlarft.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) * * .. Scalar Arguments .. * CHARACTER DIRECT, STOREV @@ -130,7 +130,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \ingroup doubleOTHERauxiliary +*> \ingroup larft * *> \par Further Details: * ===================== @@ -159,165 +159,470 @@ *> \endverbatim *> * ===================================================================== - SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) + RECURSIVE SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, + $ TAU, T, LDT ) * * -- LAPACK auxiliary routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * -* .. Scalar Arguments .. +* .. Scalar Arguments +* CHARACTER DIRECT, STOREV INTEGER K, LDT, LDV, N * .. * .. Array Arguments .. +* DOUBLE PRECISION T( LDT, * ), TAU( * ), V( LDV, * ) * .. * -* ===================================================================== -* * .. Parameters .. - DOUBLE PRECISION ONE, ZERO - PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) -* .. +* + DOUBLE PRECISION ONE, NEG_ONE, ZERO + PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE=-1.0D+0) +* * .. Local Scalars .. - INTEGER I, J, PREVLASTV, LASTV -* .. +* + INTEGER I,J,L + LOGICAL QR,LQ,QL,DIRF,COLV +* * .. External Subroutines .. - EXTERNAL DGEMV, DTRMV -* .. -* .. 
External Functions .. - LOGICAL LSAME - EXTERNAL LSAME +* + EXTERNAL DTRMM,DGEMM,DLACPY +* +* .. External Functions.. +* + LOGICAL LSAME + EXTERNAL LSAME +* +* The general scheme used is inspired by the approach inside DGEQRT3 +* which was (at the time of writing this code): +* Based on the algorithm of Elmroth and Gustavson, +* IBM J. Res. Develop. Vol 44 No. 4 July 2000. * .. * .. Executable Statements .. * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN -* - IF( LSAME( DIRECT, 'F' ) ) THEN - PREVLASTV = N - DO I = 1, K - PREVLASTV = MAX( I, PREVLASTV ) - IF( TAU( I ).EQ.ZERO ) THEN -* -* H(i) = I -* - DO J = 1, I - T( J, I ) = ZERO - END DO - ELSE -* -* general case -* - IF( LSAME( STOREV, 'C' ) ) THEN -* Skip any trailing zeros. - DO LASTV = N, I+1, -1 - IF( V( LASTV, I ).NE.ZERO ) EXIT - END DO - DO J = 1, I-1 - T( J, I ) = -TAU( I ) * V( I , J ) - END DO - J = MIN( LASTV, PREVLASTV ) -* -* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**T * V(i:j,i) -* - CALL DGEMV( 'Transpose', J-I, I-1, -TAU( I ), - $ V( I+1, 1 ), LDV, V( I+1, I ), 1, ONE, - $ T( 1, I ), 1 ) - ELSE -* Skip any trailing zeros. - DO LASTV = N, I+1, -1 - IF( V( I, LASTV ).NE.ZERO ) EXIT - END DO - DO J = 1, I-1 - T( J, I ) = -TAU( I ) * V( J , I ) - END DO - J = MIN( LASTV, PREVLASTV ) -* -* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**T -* - CALL DGEMV( 'No transpose', I-1, J-I, -TAU( I ), - $ V( 1, I+1 ), LDV, V( I, I+1 ), LDV, ONE, - $ T( 1, I ), 1 ) - END IF -* -* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) -* - CALL DTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, T, - $ LDT, T( 1, I ), 1 ) - T( I, I ) = TAU( I ) - IF( I.GT.1 ) THEN - PREVLASTV = MAX( PREVLASTV, LASTV ) - ELSE - PREVLASTV = LASTV - END IF - END IF + IF(N.EQ.0.OR.K.EQ.0) THEN + RETURN + END IF +* +* Base case +* + IF(N.EQ.1.OR.K.EQ.1) THEN + T(1,1) = TAU(1) + RETURN + END IF +* +* Beginning of executable statements +* + L = K / 2 +* +* Determine what kind of Q we need to compute +* We assume that if the user doesn't provide 'F' for DIRECT, +* then they meant to provide 'B' and if they don't provide +* 'C' for STOREV, then they meant to provide 'R' +* + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') +* +* QR happens when we have forward direction in column storage +* + QR = DIRF.AND.COLV +* +* LQ happens when we have forward direction in row storage +* + LQ = DIRF.AND.(.NOT.COLV) +* +* QL happens when we have backward direction in column storage +* + QL = (.NOT.DIRF).AND.COLV +* +* The last case is RQ. Due to how we structured this, if the +* above 3 are false, then RQ must be true, so we never store +* this +* RQ happens when we have backward direction in row storage +* RQ = (.NOT.DIRF).AND.(.NOT.COLV) +* + IF(QR) THEN +* +* Break V apart into 6 components +* +* V = |---------------| +* |V_{1,1} 0 | +* |V_{2,1} V_{2,2}| +* |V_{3,1} V_{3,2}| +* |---------------| +* +* V_{1,1}\in\R^{l,l} unit lower triangular +* V_{2,1}\in\R^{k-l,l} rectangular +* V_{3,1}\in\R^{n-k,l} rectangular +* +* V_{2,2}\in\R^{k-l,k-l} unit lower triangular +* V_{3,2}\in\R^{n-k,k-l} rectangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* |0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{l, l} upper triangular +* T_{2,2}\in\R^{k-l, k-l} upper triangular +* T_{1,2}\in\R^{l, k-l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2') +* = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2' +* +* Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} +* +* Then, we can define the matrix V as +* V = |-------| +* |V_1 V_2| +* |-------| +* +* So, our product is equivalent to the matrix product +* I - V*T*V' +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL DLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, + $ TAU(L+1), T(L+1, L+1), LDT) +* +* Compute T_{1,2} +* T_{1,2} = V_{2,1}' +* + DO J = 1, L + DO I = 1, K-L + T(J, L+I) = V(L+I, J) + END DO END DO - ELSE - PREVLASTV = 1 - DO I = K, 1, -1 - IF( TAU( I ).EQ.ZERO ) THEN -* -* H(i) = I -* - DO J = I, K - T( J, I ) = ZERO - END DO - ELSE -* -* general case -* - IF( I.LT.K ) THEN - IF( LSAME( STOREV, 'C' ) ) THEN -* Skip any leading zeros. - DO LASTV = 1, I-1 - IF( V( LASTV, I ).NE.ZERO ) EXIT - END DO - DO J = I+1, K - T( J, I ) = -TAU( I ) * V( N-K+I , J ) - END DO - J = MAX( LASTV, PREVLASTV ) -* -* T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**T * V(j:n-k+i,i) -* - CALL DGEMV( 'Transpose', N-K+I-J, K-I, -TAU( I ), - $ V( J, I+1 ), LDV, V( J, I ), 1, ONE, - $ T( I+1, I ), 1 ) - ELSE -* Skip any leading zeros. - DO LASTV = 1, I-1 - IF( V( I, LASTV ).NE.ZERO ) EXIT - END DO - DO J = I+1, K - T( J, I ) = -TAU( I ) * V( J, N-K+I ) - END DO - J = MAX( LASTV, PREVLASTV ) -* -* T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**T -* - CALL DGEMV( 'No transpose', K-I, N-K+I-J, - $ -TAU( I ), V( I+1, J ), LDV, V( I, J ), LDV, - $ ONE, T( I+1, I ), 1 ) - END IF -* -* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) -* - CALL DTRMV( 'Lower', 'No transpose', 'Non-unit', K-I, - $ T( I+1, I+1 ), LDT, T( I+1, I ), 1 ) - IF( I.GT.1 ) THEN - PREVLASTV = MIN( PREVLASTV, LASTV ) - ELSE - PREVLASTV = LASTV - END IF - END IF - T( I, I ) = TAU( I ) - END IF +* +* T_{1,2} = T_{1,2}*V_{2,2} +* + CALL DTRMM('Right', 'Lower', 'No transpose', 'Unit', L, + $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + +* +* T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL DGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, + $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, + $ T(1, L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1'*V_2 +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. 
+* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL DTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL DTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + + ELSE IF(LQ) THEN +* +* Break V apart into 6 components +* +* V = |----------------------| +* |V_{1,1} V_{1,2} V{1,3}| +* |0 V_{2,2} V{2,3}| +* |----------------------| +* +* V_{1,1}\in\R^{l,l} unit upper triangular +* V_{1,2}\in\R^{l,k-l} rectangular +* V_{1,3}\in\R^{l,n-k} rectangular +* +* V_{2,2}\in\R^{k-l,k-l} unit upper triangular +* V_{2,3}\in\R^{k-l,n-k} rectangular +* +* Where l = floor(k/2) +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* |0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{l, l} upper triangular +* T_{2,2}\in\R^{k-l, k-l} upper triangular +* T_{1,2}\in\R^{l, k-l} rectangular +* +* Then, consider the product: +* +* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) +* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 +* +* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL DLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, + $ TAU(L+1), T(L+1, L+1), LDT) + +* +* Compute T_{1,2} +* T_{1,2} = V_{1,2} +* + CALL DLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT) +* +* T_{1,2} = T_{1,2}*V_{2,2}' +* + CALL DTRMM('Right', 'Upper', 'Transpose', 'Unit', L, K-L, + $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + +* +* T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL DGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, + $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, + $ T(1, L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1*V_2' +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. +* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL DTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL DTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + ELSE IF(QL) THEN +* +* Break V apart into 6 components +* +* V = |---------------| +* |V_{1,1} V_{1,2}| +* |V_{2,1} V_{2,2}| +* |0 V_{3,2}| +* |---------------| +* +* V_{1,1}\in\R^{n-k,k-l} rectangular +* V_{2,1}\in\R^{k-l,k-l} unit upper triangular +* +* V_{1,2}\in\R^{n-k,l} rectangular +* V_{2,2}\in\R^{k-l,l} rectangular +* V_{3,2}\in\R^{l,l} unit upper triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{k-l, k-l} non-unit lower triangular +* T_{2,2}\in\R^{l, l} non-unit lower triangular +* T_{2,1}\in\R^{k-l, l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1') +* = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1' +* +* Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1} +* +* Then, we can define the matrix V as +* V = |-------| +* |V_1 V_2| +* |-------| +* +* So, our product is equivalent to the matrix product +* I - V*T*V' +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{2,1} +* +* Compute T_{1,1} recursively +* + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL DLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV, + $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) +* +* Compute T_{2,1} +* T_{2,1} = V_{2,2}' +* + DO J = 1, K-L + DO I = 1, L + T(K-L+I, J) = V(N-K+J, K-L+I) + END DO END DO - END IF - RETURN * -* End of DLARFT +* T_{2,1} = T_{2,1}*V_{2,1} +* + CALL DTRMM('Right', 'Upper', 'No transpose', 'Unit', L, + $ K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT) + +* +* T_{2,1} = V_{2,2}'*V_{2,1} + T_{2,1} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL DGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, + $ V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1), + $ LDT) +* +* At this point, we have that T_{2,1} = V_2'*V_1 +* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} +* respectively. +* +* T_{2,1} = -T_{2,2}*T_{2,1} +* + CALL DTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, + $ T(K-L+1, 1), LDT) * - END +* T_{2,1} = T_{2,1}*T_{1,1} +* + CALL DTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) + ELSE +* +* Else means RQ case +* +* Break V apart into 6 components +* +* V = |-----------------------| +* |V_{1,1} V_{1,2} 0 | +* |V_{2,1} V_{2,2} V_{2,3}| +* |-----------------------| +* +* V_{1,1}\in\R^{k-l,n-k} rectangular +* V_{1,2}\in\R^{k-l,k-l} unit lower triangular +* +* V_{2,1}\in\R^{l,n-k} rectangular +* V_{2,2}\in\R^{l,k-l} rectangular +* V_{2,3}\in\R^{l,l} unit lower triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{k-l, k-l} non-unit lower triangular +* T_{2,2}\in\R^{l, l} non-unit lower triangular +* T_{2,1}\in\R^{k-l, l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1) +* = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1 +* +* Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{2,1} +* +* Compute T_{1,1} recursively +* + CALL DLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL DLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV, + $ TAU(K-L+1), T(K-L+1, K-L+1), LDT) +* +* Compute T_{2,1} +* T_{2,1} = V_{2,2} +* + CALL DLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV, + $ T(K-L+1, 1), LDT) + +* +* T_{2,1} = T_{2,1}*V_{1,2}' +* + CALL DTRMM('Right', 'Lower', 'Transpose', 'Unit', L, K-L, + $ ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT) + +* +* T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL DGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, + $ V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1), + $ LDT) + +* +* At this point, we have that T_{2,1} = V_2*V_1' +* All that is left is to pre and post multiply by -T_{2,2} and T_{1,1} +* respectively. +* +* T_{2,1} = -T_{2,2}*T_{2,1} +* + CALL DTRMM('Left', 'Lower', 'No tranpose', 'Non-unit', L, + $ K-L, NEG_ONE, T(K-L+1, K-L+1), LDT, + $ T(K-L+1, 1), LDT) + +* +* T_{2,1} = T_{2,1}*T_{1,1} +* + CALL DTRMM('Right', 'Lower', 'No tranpose', 'Non-unit', L, + $ K-L, ONE, T, LDT, T(K-L+1, 1), LDT) + END IF + END SUBROUTINE diff --git a/lapack-netlib/SRC/slarft.f b/lapack-netlib/SRC/slarft.f index 9cfe0ad3f9..ad3a4d924c 100644 --- a/lapack-netlib/SRC/slarft.f +++ b/lapack-netlib/SRC/slarft.f @@ -18,7 +18,7 @@ * Definition: * =========== * -* SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) * * .. Scalar Arguments .. * CHARACTER DIRECT, STOREV @@ -127,10 +127,10 @@ * *> \author Univ. of Tennessee *> \author Univ. of California Berkeley -*> \author Univ. of Colorado Denver +*> \author Johnathan Rhyne, Univ. of Colorado Denver (original author, 2024) *> \author NAG Ltd. * -*> \ingroup realOTHERauxiliary +*> \ingroup larft * *> \par Further Details: * ===================== @@ -159,165 +159,470 @@ *> \endverbatim *> * ===================================================================== - SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) + RECURSIVE SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, + $ TAU, T, LDT ) * * -- LAPACK auxiliary routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * -* .. Scalar Arguments .. +* .. Scalar Arguments +* CHARACTER DIRECT, STOREV INTEGER K, LDT, LDV, N * .. * .. Array Arguments .. +* REAL T( LDT, * ), TAU( * ), V( LDV, * ) * .. * -* ===================================================================== -* * .. Parameters .. - REAL ONE, ZERO - PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) -* .. +* + REAL ONE, NEG_ONE, ZERO + PARAMETER(ONE=1.0E+0, ZERO = 0.0E+0, NEG_ONE=-1.0E+0) +* * .. Local Scalars .. - INTEGER I, J, PREVLASTV, LASTV -* .. 
+* + INTEGER I,J,L + LOGICAL QR,LQ,QL,DIRF,COLV +* * .. External Subroutines .. - EXTERNAL SGEMV, STRMV -* .. -* .. External Functions .. +* + EXTERNAL STRMM,SGEMM,SLACPY +* +* .. External Functions.. +* LOGICAL LSAME EXTERNAL LSAME +* +* The general scheme used is inspired by the approach inside DGEQRT3 +* which was (at the time of writing this code): +* Based on the algorithm of Elmroth and Gustavson, +* IBM J. Res. Develop. Vol 44 No. 4 July 2000. * .. * .. Executable Statements .. * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN -* - IF( LSAME( DIRECT, 'F' ) ) THEN - PREVLASTV = N - DO I = 1, K - PREVLASTV = MAX( I, PREVLASTV ) - IF( TAU( I ).EQ.ZERO ) THEN -* -* H(i) = I -* - DO J = 1, I - T( J, I ) = ZERO - END DO - ELSE -* -* general case -* - IF( LSAME( STOREV, 'C' ) ) THEN -* Skip any trailing zeros. - DO LASTV = N, I+1, -1 - IF( V( LASTV, I ).NE.ZERO ) EXIT - END DO - DO J = 1, I-1 - T( J, I ) = -TAU( I ) * V( I , J ) - END DO - J = MIN( LASTV, PREVLASTV ) -* -* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**T * V(i:j,i) -* - CALL SGEMV( 'Transpose', J-I, I-1, -TAU( I ), - $ V( I+1, 1 ), LDV, V( I+1, I ), 1, ONE, - $ T( 1, I ), 1 ) - ELSE -* Skip any trailing zeros. - DO LASTV = N, I+1, -1 - IF( V( I, LASTV ).NE.ZERO ) EXIT - END DO - DO J = 1, I-1 - T( J, I ) = -TAU( I ) * V( J , I ) - END DO - J = MIN( LASTV, PREVLASTV ) -* -* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**T -* - CALL SGEMV( 'No transpose', I-1, J-I, -TAU( I ), - $ V( 1, I+1 ), LDV, V( I, I+1 ), LDV, - $ ONE, T( 1, I ), 1 ) - END IF -* -* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) -* - CALL STRMV( 'Upper', 'No transpose', 'Non-unit', I-1, T, - $ LDT, T( 1, I ), 1 ) - T( I, I ) = TAU( I ) - IF( I.GT.1 ) THEN - PREVLASTV = MAX( PREVLASTV, LASTV ) - ELSE - PREVLASTV = LASTV - END IF - END IF + IF(N.EQ.0.OR.K.EQ.0) THEN + RETURN + END IF +* +* Base case +* + IF(N.EQ.1.OR.K.EQ.1) THEN + T(1,1) = TAU(1) + RETURN + END IF +* +* Beginning of executable statements +* + L = K / 2 +* +* Determine what kind of Q we need to compute +* We assume that if the user doesn't provide 'F' for DIRECT, +* then they meant to provide 'B' and if they don't provide +* 'C' for STOREV, then they meant to provide 'R' +* + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') +* +* QR happens when we have forward direction in column storage +* + QR = DIRF.AND.COLV +* +* LQ happens when we have forward direction in row storage +* + LQ = DIRF.AND.(.NOT.COLV) +* +* QL happens when we have backward direction in column storage +* + QL = (.NOT.DIRF).AND.COLV +* +* The last case is RQ. Due to how we structured this, if the +* above 3 are false, then RQ must be true, so we never store +* this +* RQ happens when we have backward direction in row storage +* RQ = (.NOT.DIRF).AND.(.NOT.COLV) +* + IF(QR) THEN +* +* Break V apart into 6 components +* +* V = |---------------| +* |V_{1,1} 0 | +* |V_{2,1} V_{2,2}| +* |V_{3,1} V_{3,2}| +* |---------------| +* +* V_{1,1}\in\R^{l,l} unit lower triangular +* V_{2,1}\in\R^{k-l,l} rectangular +* V_{3,1}\in\R^{n-k,l} rectangular +* +* V_{2,2}\in\R^{k-l,k-l} unit lower triangular +* V_{3,2}\in\R^{n-k,k-l} rectangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* |0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{l, l} upper triangular +* T_{2,2}\in\R^{k-l, k-l} upper triangular +* T_{1,2}\in\R^{l, k-l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2') +* = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2' +* +* Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} +* +* Then, we can define the matrix V as +* V = |-------| +* |V_1 V_2| +* |-------| +* +* So, our product is equivalent to the matrix product +* I - V*T*V' +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL SLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, + $ TAU(L+1), T(L+1, L+1), LDT) +* +* Compute T_{1,2} +* T_{1,2} = V_{2,1}' +* + DO J = 1, L + DO I = 1, K-L + T(J, L+I) = V(L+I, J) + END DO END DO - ELSE - PREVLASTV = 1 - DO I = K, 1, -1 - IF( TAU( I ).EQ.ZERO ) THEN -* -* H(i) = I -* - DO J = I, K - T( J, I ) = ZERO - END DO - ELSE -* -* general case -* - IF( I.LT.K ) THEN - IF( LSAME( STOREV, 'C' ) ) THEN -* Skip any leading zeros. - DO LASTV = 1, I-1 - IF( V( LASTV, I ).NE.ZERO ) EXIT - END DO - DO J = I+1, K - T( J, I ) = -TAU( I ) * V( N-K+I , J ) - END DO - J = MAX( LASTV, PREVLASTV ) -* -* T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**T * V(j:n-k+i,i) -* - CALL SGEMV( 'Transpose', N-K+I-J, K-I, -TAU( I ), - $ V( J, I+1 ), LDV, V( J, I ), 1, ONE, - $ T( I+1, I ), 1 ) - ELSE -* Skip any leading zeros. - DO LASTV = 1, I-1 - IF( V( I, LASTV ).NE.ZERO ) EXIT - END DO - DO J = I+1, K - T( J, I ) = -TAU( I ) * V( J, N-K+I ) - END DO - J = MAX( LASTV, PREVLASTV ) -* -* T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**T -* - CALL SGEMV( 'No transpose', K-I, N-K+I-J, - $ -TAU( I ), V( I+1, J ), LDV, V( I, J ), LDV, - $ ONE, T( I+1, I ), 1 ) - END IF -* -* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) -* - CALL STRMV( 'Lower', 'No transpose', 'Non-unit', K-I, - $ T( I+1, I+1 ), LDT, T( I+1, I ), 1 ) - IF( I.GT.1 ) THEN - PREVLASTV = MIN( PREVLASTV, LASTV ) - ELSE - PREVLASTV = LASTV - END IF - END IF - T( I, I ) = TAU( I ) - END IF +* +* T_{1,2} = T_{1,2}*V_{2,2} +* + CALL STRMM('Right', 'Lower', 'No transpose', 'Unit', L, + $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + +* +* T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL SGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE, + $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, + $ T(1, L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1'*V_2 +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. 
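+*           That is, T_{1,2} := -T_{1,1}*(V_1'*V_2)*T_{2,2}; the two
+*           STRMM calls below apply these triangular factors in place.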
+* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL STRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL STRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + + ELSE IF(LQ) THEN +* +* Break V apart into 6 components +* +* V = |----------------------| +* |V_{1,1} V_{1,2} V{1,3}| +* |0 V_{2,2} V{2,3}| +* |----------------------| +* +* V_{1,1}\in\R^{l,l} unit upper triangular +* V_{1,2}\in\R^{l,k-l} rectangular +* V_{1,3}\in\R^{l,n-k} rectangular +* +* V_{2,2}\in\R^{k-l,k-l} unit upper triangular +* V_{2,3}\in\R^{k-l,n-k} rectangular +* +* Where l = floor(k/2) +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* |0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\R^{l, l} upper triangular +* T_{2,2}\in\R^{k-l, k-l} upper triangular +* T_{1,2}\in\R^{l, k-l} rectangular +* +* Then, consider the product: +* +* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) +* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 +* +* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL SLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL SLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, + $ TAU(L+1), T(L+1, L+1), LDT) + +* +* Compute T_{1,2} +* T_{1,2} = V_{1,2} +* + CALL SLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT) +* +* T_{1,2} = T_{1,2}*V_{2,2}' +* + CALL STRMM('Right', 'Upper', 'Transpose', 'Unit', L, K-L, + $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + +* +* T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL SGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE, + $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, + $ T(1, L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1*V_2' +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. +* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL STRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL STRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + ELSE IF(QL) THEN +* +* Break V apart into 6 components +* +* V = |---------------| +* |V_{1,1} V_{1,2}| +* |V_{2,1} V_{2,2}| +* |0 V_{3,2}| +* |---------------| +* +* V_{1,1}\in\R^{n-k,k-l} rectangular +* V_{2,1}\in\R^{k-l,k-l} unit upper triangular +* +* V_{1,2}\in\R^{n-k,l} rectangular +* V_{2,2}\in\R^{k-l,l} rectangular +* V_{3,2}\in\R^{l,l} unit upper triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 

Then collect the associated reflectors in V_1 and V_2
+*
+*           T_{1,1}\in\R^{k-l, k-l} non-unit lower triangular
+*           T_{2,2}\in\R^{l, l} non-unit lower triangular
+*           T_{2,1}\in\R^{l, k-l} rectangular
+*
+*           Where l = floor(k/2)
+*
+*           Then, consider the product:
+*
+*           (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1')
+*           = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1'
+*
+*           Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1}
+*
+*           Then, we can define the matrix V as
+*              V = |-------|
+*                  |V_1 V_2|
+*                  |-------|
+*
+*           So, our product is equivalent to the matrix product
+*              I - V*T*V'
+*           This means, we can compute T_{1,1} and T_{2,2}, then use this information
+*           to compute T_{2,1}
+*
+*           Compute T_{1,1} recursively
+*
+            CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+*
+*           Compute T_{2,2} recursively
+*
+            CALL SLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV,
+     $                  TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+*
+*           Compute T_{2,1}
+*           T_{2,1} = V_{2,2}'
+*
+            DO J = 1, K-L
+               DO I = 1, L
+                  T(K-L+I, J) = V(N-K+J, K-L+I)
+               END DO
             END DO
-         END IF
-      RETURN
 *
-*     End of SLARFT
+*           T_{2,1} = T_{2,1}*V_{2,1}
+*
+            CALL STRMM('Right', 'Upper', 'No transpose', 'Unit', L,
+     $                  K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT)
+
+*
+*           T_{2,1} = V_{1,2}'*V_{1,1} + T_{2,1}
+*           Note: We assume K <= N, and GEMM will do nothing if N=K
+*
+            CALL SGEMM('Transpose', 'No transpose', L, K-L, N-K, ONE,
+     $                  V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1),
+     $                  LDT)
+*
+*           At this point, we have that T_{2,1} = V_2'*V_1
+*           All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
+*           respectively.
+*
+*           T_{2,1} = -T_{2,2}*T_{2,1}
+*
+            CALL STRMM('Left', 'Lower', 'No transpose', 'Non-unit', L,
+     $                  K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
+     $                  T(K-L+1, 1), LDT)
 *
-      END
+*           T_{2,1} = T_{2,1}*T_{1,1}
+*
+            CALL STRMM('Right', 'Lower', 'No transpose', 'Non-unit', L,
+     $                  K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
+         ELSE
+*
+*           Else means RQ case
+*
+*           Break V apart into 6 components
+*
+*           V = |-----------------------|
+*               |V_{1,1} V_{1,2} 0      |
+*               |V_{2,1} V_{2,2} V_{2,3}|
+*               |-----------------------|
+*
+*           V_{1,1}\in\R^{k-l,n-k} rectangular
+*           V_{1,2}\in\R^{k-l,k-l} unit lower triangular
+*
+*           V_{2,1}\in\R^{l,n-k} rectangular
+*           V_{2,2}\in\R^{l,k-l} rectangular
+*           V_{2,3}\in\R^{l,l} unit lower triangular
+*
+*           We will construct the T matrix
+*           T = |---------------|
+*               |T_{1,1} 0      |
+*               |T_{2,1} T_{2,2}|
+*               |---------------|
+*
+*           T is the triangular factor obtained from block reflectors.
+*           To motivate the structure, assume we have already computed T_{1,1}
+*           and T_{2,2}.
Then collect the associated reflectors in V_1 and V_2
+*
+*           T_{1,1}\in\R^{k-l, k-l} non-unit lower triangular
+*           T_{2,2}\in\R^{l, l} non-unit lower triangular
+*           T_{2,1}\in\R^{l, k-l} rectangular
+*
+*           Where l = floor(k/2)
+*
+*           Then, consider the product:
+*
+*           (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
+*           = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
+*
+*           Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
+*
+*           Then, we can define the matrix V as
+*              V = |---|
+*                  |V_1|
+*                  |V_2|
+*                  |---|
+*
+*           So, our product is equivalent to the matrix product
+*              I - V'*T*V
+*           This means, we can compute T_{1,1} and T_{2,2}, then use this information
+*           to compute T_{2,1}
+*
+*           Compute T_{1,1} recursively
+*
+            CALL SLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+*
+*           Compute T_{2,2} recursively
+*
+            CALL SLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV,
+     $                  TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+*
+*           Compute T_{2,1}
+*           T_{2,1} = V_{2,2}
+*
+            CALL SLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV,
+     $                  T(K-L+1, 1), LDT)
+
+*
+*           T_{2,1} = T_{2,1}*V_{1,2}'
+*
+            CALL STRMM('Right', 'Lower', 'Transpose', 'Unit', L, K-L,
+     $                  ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT)
+
+*
+*           T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
+*           Note: We assume K <= N, and GEMM will do nothing if N=K
+*
+            CALL SGEMM('No transpose', 'Transpose', L, K-L, N-K, ONE,
+     $                  V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1),
+     $                  LDT)
+
+*
+*           At this point, we have that T_{2,1} = V_2*V_1'
+*           All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
+*           respectively.
+*
+*           T_{2,1} = -T_{2,2}*T_{2,1}
+*
+            CALL STRMM('Left', 'Lower', 'No transpose', 'Non-unit', L,
+     $                  K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
+     $                  T(K-L+1, 1), LDT)
+
+*
+*           T_{2,1} = T_{2,1}*T_{1,1}
+*
+            CALL STRMM('Right', 'Lower', 'No transpose', 'Non-unit', L,
+     $                  K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
+      END IF
+      END SUBROUTINE
diff --git a/lapack-netlib/SRC/zlarft.f b/lapack-netlib/SRC/zlarft.f
index 5ad0996fab..900795afad 100644
--- a/lapack-netlib/SRC/zlarft.f
+++ b/lapack-netlib/SRC/zlarft.f
@@ -18,7 +18,7 @@
 * Definition:
 * ===========
 *
-*       SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+*       RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          DIRECT, STOREV
@@ -130,7 +130,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERauxiliary
+*> \ingroup larft
 *
 *> \par Further Details:
 *  =====================
@@ -159,166 +159,474 @@
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+      RECURSIVE SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV,
+     $                             TAU, T, LDT )
 *
 *  -- LAPACK auxiliary routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
-*     .. Scalar Arguments ..
-      CHARACTER          DIRECT, STOREV
-      INTEGER            K, LDT, LDV, N
+*     .. Scalar Arguments
+*
+      CHARACTER          DIRECT, STOREV
+      INTEGER            K, LDT, LDV, N
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16         T( LDT, * ), TAU( * ), V( LDV, * )
-*     ..
 *
-*  =====================================================================
+      COMPLEX*16         T( LDT, * ), TAU( * ), V( LDV, * )
+*     ..
 *
 *     .. Parameters ..
-      COMPLEX*16         ONE, ZERO
-      PARAMETER          ( ONE = ( 1.0D+0, 0.0D+0 ),
-     $                     ZERO = ( 0.0D+0, 0.0D+0 ) )
-*     ..
+*
+      COMPLEX*16         ONE, NEG_ONE, ZERO
+      PARAMETER(ONE=1.0D+0, ZERO = 0.0D+0, NEG_ONE=-1.0D+0)
+*
 *     .. Local Scalars ..
-      INTEGER            I, J, PREVLASTV, LASTV
-*     ..
+* + INTEGER I,J,L + LOGICAL QR,LQ,QL,DIRF,COLV +* * .. External Subroutines .. - EXTERNAL ZGEMV, ZTRMV, ZGEMM -* .. -* .. External Functions .. - LOGICAL LSAME - EXTERNAL LSAME +* + EXTERNAL ZTRMM,ZGEMM,ZLACPY +* +* .. External Functions.. +* + LOGICAL LSAME + EXTERNAL LSAME +* +* .. Intrinsic Functions.. +* + INTRINSIC CONJG +* +* The general scheme used is inspired by the approach inside DGEQRT3 +* which was (at the time of writing this code): +* Based on the algorithm of Elmroth and Gustavson, +* IBM J. Res. Develop. Vol 44 No. 4 July 2000. * .. * .. Executable Statements .. * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN -* - IF( LSAME( DIRECT, 'F' ) ) THEN - PREVLASTV = N - DO I = 1, K - PREVLASTV = MAX( PREVLASTV, I ) - IF( TAU( I ).EQ.ZERO ) THEN -* -* H(i) = I -* - DO J = 1, I - T( J, I ) = ZERO - END DO - ELSE -* -* general case -* - IF( LSAME( STOREV, 'C' ) ) THEN -* Skip any trailing zeros. - DO LASTV = N, I+1, -1 - IF( V( LASTV, I ).NE.ZERO ) EXIT - END DO - DO J = 1, I-1 - T( J, I ) = -TAU( I ) * CONJG( V( I , J ) ) - END DO - J = MIN( LASTV, PREVLASTV ) -* -* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**H * V(i:j,i) -* - CALL ZGEMV( 'Conjugate transpose', J-I, I-1, - $ -TAU( I ), V( I+1, 1 ), LDV, - $ V( I+1, I ), 1, ONE, T( 1, I ), 1 ) - ELSE -* Skip any trailing zeros. - DO LASTV = N, I+1, -1 - IF( V( I, LASTV ).NE.ZERO ) EXIT - END DO - DO J = 1, I-1 - T( J, I ) = -TAU( I ) * V( J , I ) - END DO - J = MIN( LASTV, PREVLASTV ) -* -* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**H -* - CALL ZGEMM( 'N', 'C', I-1, 1, J-I, -TAU( I ), - $ V( 1, I+1 ), LDV, V( I, I+1 ), LDV, - $ ONE, T( 1, I ), LDT ) - END IF -* -* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) -* - CALL ZTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, T, - $ LDT, T( 1, I ), 1 ) - T( I, I ) = TAU( I ) - IF( I.GT.1 ) THEN - PREVLASTV = MAX( PREVLASTV, LASTV ) - ELSE - PREVLASTV = LASTV - END IF - END IF + IF(N.EQ.0.OR.K.EQ.0) THEN + RETURN + END IF +* +* Base case +* + IF(N.EQ.1.OR.K.EQ.1) THEN + T(1,1) = TAU(1) + RETURN + END IF +* +* Beginning of executable statements +* + L = K / 2 +* +* Determine what kind of Q we need to compute +* We assume that if the user doesn't provide 'F' for DIRECT, +* then they meant to provide 'B' and if they don't provide +* 'C' for STOREV, then they meant to provide 'R' +* + DIRF = LSAME(DIRECT,'F') + COLV = LSAME(STOREV,'C') +* +* QR happens when we have forward direction in column storage +* + QR = DIRF.AND.COLV +* +* LQ happens when we have forward direction in row storage +* + LQ = DIRF.AND.(.NOT.COLV) +* +* QL happens when we have backward direction in column storage +* + QL = (.NOT.DIRF).AND.COLV +* +* The last case is RQ. Due to how we structured this, if the +* above 3 are false, then RQ must be true, so we never store +* this +* RQ happens when we have backward direction in row storage +* RQ = (.NOT.DIRF).AND.(.NOT.COLV) +* + IF(QR) THEN +* +* Break V apart into 6 components +* +* V = |---------------| +* |V_{1,1} 0 | +* |V_{2,1} V_{2,2}| +* |V_{3,1} V_{3,2}| +* |---------------| +* +* V_{1,1}\in\C^{l,l} unit lower triangular +* V_{2,1}\in\C^{k-l,l} rectangular +* V_{3,1}\in\C^{n-k,l} rectangular +* +* V_{2,2}\in\C^{k-l,k-l} unit lower triangular +* V_{3,2}\in\C^{n-k,k-l} rectangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* |0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{l, l} upper triangular +* T_{2,2}\in\C^{k-l, k-l} upper triangular +* T_{1,2}\in\C^{l, k-l} rectangular +* +* Where l = floor(k/2) +* +* Then, consider the product: +* +* (I - V_1*T_{1,1}*V_1')*(I - V_2*T_{2,2}*V_2') +* = I - V_1*T_{1,1}*V_1' - V_2*T_{2,2}*V_2' + V_1*T_{1,1}*V_1'*V_2*T_{2,2}*V_2' +* +* Define T_{1,2} = -T_{1,1}*V_1'*V_2*T_{2,2} +* +* Then, we can define the matrix V as +* V = |-------| +* |V_1 V_2| +* |-------| +* +* So, our product is equivalent to the matrix product +* I - V*T*V' +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL ZLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, + $ TAU(L+1), T(L+1, L+1), LDT) +* +* Compute T_{1,2} +* T_{1,2} = V_{2,1}' +* + DO J = 1, L + DO I = 1, K-L + T(J, L+I) = CONJG(V(L+I, J)) + END DO END DO - ELSE - PREVLASTV = 1 - DO I = K, 1, -1 - IF( TAU( I ).EQ.ZERO ) THEN -* -* H(i) = I -* - DO J = I, K - T( J, I ) = ZERO - END DO - ELSE -* -* general case -* - IF( I.LT.K ) THEN - IF( LSAME( STOREV, 'C' ) ) THEN -* Skip any leading zeros. - DO LASTV = 1, I-1 - IF( V( LASTV, I ).NE.ZERO ) EXIT - END DO - DO J = I+1, K - T( J, I ) = -TAU( I ) * CONJG( V( N-K+I , J ) ) - END DO - J = MAX( LASTV, PREVLASTV ) -* -* T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**H * V(j:n-k+i,i) -* - CALL ZGEMV( 'Conjugate transpose', N-K+I-J, K-I, - $ -TAU( I ), V( J, I+1 ), LDV, V( J, I ), - $ 1, ONE, T( I+1, I ), 1 ) - ELSE -* Skip any leading zeros. - DO LASTV = 1, I-1 - IF( V( I, LASTV ).NE.ZERO ) EXIT - END DO - DO J = I+1, K - T( J, I ) = -TAU( I ) * V( J, N-K+I ) - END DO - J = MAX( LASTV, PREVLASTV ) -* -* T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**H -* - CALL ZGEMM( 'N', 'C', K-I, 1, N-K+I-J, -TAU( I ), - $ V( I+1, J ), LDV, V( I, J ), LDV, - $ ONE, T( I+1, I ), LDT ) - END IF -* -* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) -* - CALL ZTRMV( 'Lower', 'No transpose', 'Non-unit', K-I, - $ T( I+1, I+1 ), LDT, T( I+1, I ), 1 ) - IF( I.GT.1 ) THEN - PREVLASTV = MIN( PREVLASTV, LASTV ) - ELSE - PREVLASTV = LASTV - END IF - END IF - T( I, I ) = TAU( I ) - END IF +* +* T_{1,2} = T_{1,2}*V_{2,2} +* + CALL ZTRMM('Right', 'Lower', 'No transpose', 'Unit', L, + $ K-L, ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + +* +* T_{1,2} = V_{3,1}'*V_{3,2} + T_{1,2} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL ZGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE, + $ V(K+1, 1), LDV, V(K+1, L+1), LDV, ONE, + $ T(1, L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1'*V_2 +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. 
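+*           That is, T_{1,2} := -T_{1,1}*(V_1'*V_2)*T_{2,2}, with '
+*           denoting conjugate transpose; the two ZTRMM calls below
+*           apply these triangular factors in place.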
+* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL ZTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL ZTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + + ELSE IF(LQ) THEN +* +* Break V apart into 6 components +* +* V = |----------------------| +* |V_{1,1} V_{1,2} V{1,3}| +* |0 V_{2,2} V{2,3}| +* |----------------------| +* +* V_{1,1}\in\C^{l,l} unit upper triangular +* V_{1,2}\in\C^{l,k-l} rectangular +* V_{1,3}\in\C^{l,n-k} rectangular +* +* V_{2,2}\in\C^{k-l,k-l} unit upper triangular +* V_{2,3}\in\C^{k-l,n-k} rectangular +* +* Where l = floor(k/2) +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} T_{1,2}| +* |0 T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. Then collect the associated reflectors in V_1 and V_2 +* +* T_{1,1}\in\C^{l, l} upper triangular +* T_{2,2}\in\C^{k-l, k-l} upper triangular +* T_{1,2}\in\C^{l, k-l} rectangular +* +* Then, consider the product: +* +* (I - V_1'*T_{1,1}*V_1)*(I - V_2'*T_{2,2}*V_2) +* = I - V_1'*T_{1,1}*V_1 - V_2'*T_{2,2}*V_2 + V_1'*T_{1,1}*V_1*V_2'*T_{2,2}*V_2 +* +* Define T_{1,2} = -T_{1,1}*V_1*V_2'*T_{2,2} +* +* Then, we can define the matrix V as +* V = |---| +* |V_1| +* |V_2| +* |---| +* +* So, our product is equivalent to the matrix product +* I - V'*T*V +* This means, we can compute T_{1,1} and T_{2,2}, then use this information +* to compute T_{1,2} +* +* Compute T_{1,1} recursively +* + CALL ZLARFT(DIRECT, STOREV, N, L, V, LDV, TAU, T, LDT) +* +* Compute T_{2,2} recursively +* + CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V(L+1, L+1), LDV, + $ TAU(L+1), T(L+1, L+1), LDT) + +* +* Compute T_{1,2} +* T_{1,2} = V_{1,2} +* + CALL ZLACPY('All', L, K-L, V(1, L+1), LDV, T(1, L+1), LDT) +* +* T_{1,2} = T_{1,2}*V_{2,2}' +* + CALL ZTRMM('Right', 'Upper', 'Conjugate', 'Unit', L, K-L, + $ ONE, V(L+1, L+1), LDV, T(1, L+1), LDT) + +* +* T_{1,2} = V_{1,3}*V_{2,3}' + T_{1,2} +* Note: We assume K <= N, and GEMM will do nothing if N=K +* + CALL ZGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE, + $ V(1, K+1), LDV, V(L+1, K+1), LDV, ONE, + $ T(1, L+1), LDT) +* +* At this point, we have that T_{1,2} = V_1*V_2' +* All that is left is to pre and post multiply by -T_{1,1} and T_{2,2} +* respectively. +* +* T_{1,2} = -T_{1,1}*T_{1,2} +* + CALL ZTRMM('Left', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, NEG_ONE, T, LDT, T(1, L+1), LDT) + +* +* T_{1,2} = T_{1,2}*T_{2,2} +* + CALL ZTRMM('Right', 'Upper', 'No transpose', 'Non-unit', L, + $ K-L, ONE, T(L+1, L+1), LDT, T(1, L+1), LDT) + ELSE IF(QL) THEN +* +* Break V apart into 6 components +* +* V = |---------------| +* |V_{1,1} V_{1,2}| +* |V_{2,1} V_{2,2}| +* |0 V_{3,2}| +* |---------------| +* +* V_{1,1}\in\C^{n-k,k-l} rectangular +* V_{2,1}\in\C^{k-l,k-l} unit upper triangular +* +* V_{1,2}\in\C^{n-k,l} rectangular +* V_{2,2}\in\C^{k-l,l} rectangular +* V_{3,2}\in\C^{l,l} unit upper triangular +* +* We will construct the T matrix +* T = |---------------| +* |T_{1,1} 0 | +* |T_{2,1} T_{2,2}| +* |---------------| +* +* T is the triangular factor obtained from block reflectors. +* To motivate the structure, assume we have already computed T_{1,1} +* and T_{2,2}. 
Then collect the associated reflectors in V_1 and V_2
+*
+*           T_{1,1}\in\C^{k-l, k-l} non-unit lower triangular
+*           T_{2,2}\in\C^{l, l} non-unit lower triangular
+*           T_{2,1}\in\C^{l, k-l} rectangular
+*
+*           Where l = floor(k/2)
+*
+*           Then, consider the product:
+*
+*           (I - V_2*T_{2,2}*V_2')*(I - V_1*T_{1,1}*V_1')
+*           = I - V_2*T_{2,2}*V_2' - V_1*T_{1,1}*V_1' + V_2*T_{2,2}*V_2'*V_1*T_{1,1}*V_1'
+*
+*           Define T_{2,1} = -T_{2,2}*V_2'*V_1*T_{1,1}
+*
+*           Then, we can define the matrix V as
+*              V = |-------|
+*                  |V_1 V_2|
+*                  |-------|
+*
+*           So, our product is equivalent to the matrix product
+*              I - V*T*V'
+*           This means, we can compute T_{1,1} and T_{2,2}, then use this information
+*           to compute T_{2,1}
+*
+*           Compute T_{1,1} recursively
+*
+            CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+*
+*           Compute T_{2,2} recursively
+*
+            CALL ZLARFT(DIRECT, STOREV, N, L, V(1, K-L+1), LDV,
+     $                  TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+*
+*           Compute T_{2,1}
+*           T_{2,1} = V_{2,2}'
+*
+            DO J = 1, K-L
+               DO I = 1, L
+                  T(K-L+I, J) = CONJG(V(N-K+J, K-L+I))
+               END DO
             END DO
-         END IF
-      RETURN
 *
-*     End of ZLARFT
+*           T_{2,1} = T_{2,1}*V_{2,1}
+*
+            CALL ZTRMM('Right', 'Upper', 'No transpose', 'Unit', L,
+     $                  K-L, ONE, V(N-K+1, 1), LDV, T(K-L+1, 1), LDT)
+
+*
+*           T_{2,1} = V_{1,2}'*V_{1,1} + T_{2,1}
+*           Note: We assume K <= N, and GEMM will do nothing if N=K
+*
+            CALL ZGEMM('Conjugate', 'No transpose', L, K-L, N-K, ONE,
+     $                  V(1, K-L+1), LDV, V, LDV, ONE, T(K-L+1, 1),
+     $                  LDT)
+*
+*           At this point, we have that T_{2,1} = V_2'*V_1
+*           All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
+*           respectively.
+*
+*           T_{2,1} = -T_{2,2}*T_{2,1}
+*
+            CALL ZTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L,
+     $                  K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
+     $                  T(K-L+1, 1), LDT)
 *
-      END
+*           T_{2,1} = T_{2,1}*T_{1,1}
+*
+            CALL ZTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L,
+     $                  K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
+         ELSE
+*
+*           Else means RQ case
+*
+*           Break V apart into 6 components
+*
+*           V = |-----------------------|
+*               |V_{1,1} V_{1,2} 0      |
+*               |V_{2,1} V_{2,2} V_{2,3}|
+*               |-----------------------|
+*
+*           V_{1,1}\in\C^{k-l,n-k} rectangular
+*           V_{1,2}\in\C^{k-l,k-l} unit lower triangular
+*
+*           V_{2,1}\in\C^{l,n-k} rectangular
+*           V_{2,2}\in\C^{l,k-l} rectangular
+*           V_{2,3}\in\C^{l,l} unit lower triangular
+*
+*           We will construct the T matrix
+*           T = |---------------|
+*               |T_{1,1} 0      |
+*               |T_{2,1} T_{2,2}|
+*               |---------------|
+*
+*           T is the triangular factor obtained from block reflectors.
+*           To motivate the structure, assume we have already computed T_{1,1}
+*           and T_{2,2}.
Then collect the associated reflectors in V_1 and V_2
+*
+*           T_{1,1}\in\C^{k-l, k-l} non-unit lower triangular
+*           T_{2,2}\in\C^{l, l} non-unit lower triangular
+*           T_{2,1}\in\C^{l, k-l} rectangular
+*
+*           Where l = floor(k/2)
+*
+*           Then, consider the product:
+*
+*           (I - V_2'*T_{2,2}*V_2)*(I - V_1'*T_{1,1}*V_1)
+*           = I - V_2'*T_{2,2}*V_2 - V_1'*T_{1,1}*V_1 + V_2'*T_{2,2}*V_2*V_1'*T_{1,1}*V_1
+*
+*           Define T_{2,1} = -T_{2,2}*V_2*V_1'*T_{1,1}
+*
+*           Then, we can define the matrix V as
+*              V = |---|
+*                  |V_1|
+*                  |V_2|
+*                  |---|
+*
+*           So, our product is equivalent to the matrix product
+*              I - V'*T*V
+*           This means, we can compute T_{1,1} and T_{2,2}, then use this information
+*           to compute T_{2,1}
+*
+*           Compute T_{1,1} recursively
+*
+            CALL ZLARFT(DIRECT, STOREV, N-L, K-L, V, LDV, TAU, T, LDT)
+*
+*           Compute T_{2,2} recursively
+*
+            CALL ZLARFT(DIRECT, STOREV, N, L, V(K-L+1, 1), LDV,
+     $                  TAU(K-L+1), T(K-L+1, K-L+1), LDT)
+*
+*           Compute T_{2,1}
+*           T_{2,1} = V_{2,2}
+*
+            CALL ZLACPY('All', L, K-L, V(K-L+1, N-K+1), LDV,
+     $                  T(K-L+1, 1), LDT)
+
+*
+*           T_{2,1} = T_{2,1}*V_{1,2}'
+*
+            CALL ZTRMM('Right', 'Lower', 'Conjugate', 'Unit', L, K-L,
+     $                  ONE, V(1, N-K+1), LDV, T(K-L+1, 1), LDT)
+
+*
+*           T_{2,1} = V_{2,1}*V_{1,1}' + T_{2,1}
+*           Note: We assume K <= N, and GEMM will do nothing if N=K
+*
+            CALL ZGEMM('No transpose', 'Conjugate', L, K-L, N-K, ONE,
+     $                  V(K-L+1, 1), LDV, V, LDV, ONE, T(K-L+1, 1),
+     $                  LDT)
+
+*
+*           At this point, we have that T_{2,1} = V_2*V_1'
+*           All that is left is to pre and post multiply by -T_{2,2} and T_{1,1}
+*           respectively.
+*
+*           T_{2,1} = -T_{2,2}*T_{2,1}
+*
+            CALL ZTRMM('Left', 'Lower', 'No transpose', 'Non-unit', L,
+     $                  K-L, NEG_ONE, T(K-L+1, K-L+1), LDT,
+     $                  T(K-L+1, 1), LDT)
+
+*
+*           T_{2,1} = T_{2,1}*T_{1,1}
+*
+            CALL ZTRMM('Right', 'Lower', 'No transpose', 'Non-unit', L,
+     $                  K-L, ONE, T, LDT, T(K-L+1, 1), LDT)
+      END IF
+      END SUBROUTINE

From 0c4b4cd78c89c9cd387f77cada779290ed35e7ed Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 11 Jan 2025 15:38:44 -0800
Subject: [PATCH 240/244] move the non-recursive original ?larft here
 (Reference-LAPACK PR 1080)

---
 lapack-netlib/SRC/VARIANTS/Makefile | 10 ++++++++--
 lapack-netlib/SRC/VARIANTS/README   |  2 ++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile
index 35e50cbc2c..4b0575cc6f 100644
--- a/lapack-netlib/SRC/VARIANTS/Makefile
+++ b/lapack-netlib/SRC/VARIANTS/Makefile
@@ -30,9 +30,11 @@ LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o

 QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o

+LARFTL2 = larft/LL_LVL2/clarft.o larft/LL_LVL2/dlarft.o larft/LL_LVL2/slarft.o larft/LL_LVL2/zlarft.o
+
 .PHONY: all

-all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a
+all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a larftl2.a

 cholrl.a: $(CHOLRL)
 	$(AR) $(ARFLAGS) $@ $^
@@ -58,9 +60,13 @@ qrll.a: $(QRLL)
 	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@

+larftl2.a: $(LARFTL2)
+	$(AR) $(ARFLAGS) $@ $^
+	$(RANLIB) $@
+
 .PHONY: clean cleanobj cleanlib
 clean: cleanobj cleanlib
 cleanobj:
-	rm -f $(CHOLRL) $(CHOLTOP) $(LUCR) $(LULL) $(LUREC) $(QRLL)
+	rm -f $(CHOLRL) $(CHOLTOP) $(LUCR) $(LULL) $(LUREC) $(QRLL) $(LARFTL2)
 cleanlib:
 	rm -f *.a
diff --git a/lapack-netlib/SRC/VARIANTS/README b/lapack-netlib/SRC/VARIANTS/README
index ef7626debe..217cfa3e01 100644
--- a/lapack-netlib/SRC/VARIANTS/README
+++ b/lapack-netlib/SRC/VARIANTS/README
@@ -23,6 +23,7 @@ This directory contains several variants of LAPACK routines in single/double/com
 - [sdcz]geqrf with QR Left Looking Level 3 BLAS version algorithm [2]- Directory: SRC/VARIANTS/qr/LL
 - [sdcz]potrf with Cholesky Right Looking Level 3 BLAS version algorithm [2]- Directory: SRC/VARIANTS/cholesky/RL
 - [sdcz]potrf with Cholesky Top Level 3 BLAS version algorithm [2]- Directory: SRC/VARIANTS/cholesky/TOP
+ - [sdcz]larft using a Left Looking Level 2 BLAS version algorithm - Directory: SRC/VARIANTS/larft/LL_LVL2

 References:For a more detailed description please refer to
 - [1] Toledo, S. 1997. Locality of Reference in LU Decomposition with Partial Pivoting. SIAM J. Matrix Anal. Appl. 18, 4 (Oct. 1997),
@@ -44,6 +45,7 @@ Corresponding libraries created in SRC/VARIANTS:
 - QR Left Looking : qrll.a
 - Cholesky Right Looking : cholrl.a
 - Cholesky Top : choltop.a
+ - LARFT Level 2: larftl2.a

 ===========

From 459fa8102b599713e97589d1ea4f68b2c3250804 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 12 Jan 2025 00:41:54 +0100
Subject: [PATCH 241/244] Create subdirectory for the old non-recursive ?larft

---
 lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f

diff --git a/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f
new file mode 100644
index 0000000000..8b13789179
--- /dev/null
+++ b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f
@@ -0,0 +1 @@
+

From d035e80d33711e74cdfe7f4bc975d7bb47253e07 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sat, 11 Jan 2025 15:42:56 -0800
Subject: [PATCH 242/244] move the original non-recursive ?LARFT here
 (Reference-LAPACK PR 1080)

---
 .../SRC/VARIANTS/larft/LL_LVL2/clarft.f       | 328 ++++++++++++++++++
 .../SRC/VARIANTS/larft/LL_LVL2/dlarft.f       | 326 +++++++++++++++++
 .../SRC/VARIANTS/larft/LL_LVL2/slarft.f       | 327 ++++++++++++++++-
 .../SRC/VARIANTS/larft/LL_LVL2/zlarft.f       | 327 +++++++++++++++++
 4 files changed, 1307 insertions(+), 1 deletion(-)
 create mode 100644 lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/clarft.f
 create mode 100644 lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/dlarft.f
 create mode 100644 lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/zlarft.f

diff --git a/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/clarft.f b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/clarft.f
new file mode 100644
index 0000000000..9a7000eff3
--- /dev/null
+++ b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/clarft.f
@@ -0,0 +1,328 @@
+*> \brief \b CLARFT VARIANT: left-looking Level 2 BLAS version of the algorithm
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CLARFT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          DIRECT, STOREV
+*       INTEGER            K, LDT, LDV, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            T( LDT, * ), TAU( * ), V( LDV, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLARFT forms the triangular factor T of a complex block reflector H
+*> of order n, which is defined as a product of k elementary reflectors.
+*>
+*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular;
+*>
+*> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular.
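+*>
+*> This is the original non-recursive implementation, retained as a
+*> variant; the recursive Level 3 BLAS version now replaces it in SRC.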
+*> +*> If STOREV = 'C', the vector which defines the elementary reflector +*> H(i) is stored in the i-th column of the array V, and +*> +*> H = I - V * T * V**H +*> +*> If STOREV = 'R', the vector which defines the elementary reflector +*> H(i) is stored in the i-th row of the array V, and +*> +*> H = I - V**H * T * V +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Specifies the order in which the elementary reflectors are +*> multiplied to form the block reflector: +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Specifies how the vectors which define the elementary +*> reflectors are stored (see also Further Details): +*> = 'C': columnwise +*> = 'R': rowwise +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the block reflector H. N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the triangular factor T (= the number of +*> elementary reflectors). K >= 1. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,N) if STOREV = 'R' +*> The matrix V. See further details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] TAU +*> \verbatim +*> TAU is COMPLEX array, dimension (K) +*> TAU(i) must contain the scalar factor of the elementary +*> reflector H(i). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,K) +*> The k by k triangular factor T of the block reflector. +*> If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is +*> lower triangular. The rest of the array is not used. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larft +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The elements equal to 1 are not stored. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER DIRECT, STOREV + INTEGER K, LDT, LDV, N +* .. +* .. Array Arguments .. + COMPLEX T( LDT, * ), TAU( * ), V( LDV, * ) +* .. +* +* ===================================================================== +* +* .. 
Parameters .. + COMPLEX ONE, ZERO + PARAMETER ( ONE = ( 1.0E+0, 0.0E+0 ), + $ ZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, J, PREVLASTV, LASTV +* .. +* .. External Subroutines .. + EXTERNAL CGEMM, CGEMV, CTRMV +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( LSAME( DIRECT, 'F' ) ) THEN + PREVLASTV = N + DO I = 1, K + PREVLASTV = MAX( PREVLASTV, I ) + IF( TAU( I ).EQ.ZERO ) THEN +* +* H(i) = I +* + DO J = 1, I + T( J, I ) = ZERO + END DO + ELSE +* +* general case +* + IF( LSAME( STOREV, 'C' ) ) THEN +* Skip any trailing zeros. + DO LASTV = N, I+1, -1 + IF( V( LASTV, I ).NE.ZERO ) EXIT + END DO + DO J = 1, I-1 + T( J, I ) = -TAU( I ) * CONJG( V( I , J ) ) + END DO + J = MIN( LASTV, PREVLASTV ) +* +* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**H * V(i:j,i) +* + CALL CGEMV( 'Conjugate transpose', J-I, I-1, + $ -TAU( I ), V( I+1, 1 ), LDV, + $ V( I+1, I ), 1, + $ ONE, T( 1, I ), 1 ) + ELSE +* Skip any trailing zeros. + DO LASTV = N, I+1, -1 + IF( V( I, LASTV ).NE.ZERO ) EXIT + END DO + DO J = 1, I-1 + T( J, I ) = -TAU( I ) * V( J , I ) + END DO + J = MIN( LASTV, PREVLASTV ) +* +* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**H +* + CALL CGEMM( 'N', 'C', I-1, 1, J-I, -TAU( I ), + $ V( 1, I+1 ), LDV, V( I, I+1 ), LDV, + $ ONE, T( 1, I ), LDT ) + END IF +* +* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) +* + CALL CTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, + $ T, + $ LDT, T( 1, I ), 1 ) + T( I, I ) = TAU( I ) + IF( I.GT.1 ) THEN + PREVLASTV = MAX( PREVLASTV, LASTV ) + ELSE + PREVLASTV = LASTV + END IF + END IF + END DO + ELSE + PREVLASTV = 1 + DO I = K, 1, -1 + IF( TAU( I ).EQ.ZERO ) THEN +* +* H(i) = I +* + DO J = I, K + T( J, I ) = ZERO + END DO + ELSE +* +* general case +* + IF( I.LT.K ) THEN + IF( LSAME( STOREV, 'C' ) ) THEN +* Skip any leading zeros. + DO LASTV = 1, I-1 + IF( V( LASTV, I ).NE.ZERO ) EXIT + END DO + DO J = I+1, K + T( J, I ) = -TAU( I ) * CONJG( V( N-K+I , J ) ) + END DO + J = MAX( LASTV, PREVLASTV ) +* +* T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**H * V(j:n-k+i,i) +* + CALL CGEMV( 'Conjugate transpose', N-K+I-J, K-I, + $ -TAU( I ), V( J, I+1 ), LDV, V( J, I ), + $ 1, ONE, T( I+1, I ), 1 ) + ELSE +* Skip any leading zeros. 
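+*                 (LASTV scans row I of V for its first nonzero entry;
+*                 together with PREVLASTV it restricts the CGEMM below
+*                 to the nonzero part of V.)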
+ DO LASTV = 1, I-1 + IF( V( I, LASTV ).NE.ZERO ) EXIT + END DO + DO J = I+1, K + T( J, I ) = -TAU( I ) * V( J, N-K+I ) + END DO + J = MAX( LASTV, PREVLASTV ) +* +* T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**H +* + CALL CGEMM( 'N', 'C', K-I, 1, N-K+I-J, + $ -TAU( I ), + $ V( I+1, J ), LDV, V( I, J ), LDV, + $ ONE, T( I+1, I ), LDT ) + END IF +* +* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) +* + CALL CTRMV( 'Lower', 'No transpose', 'Non-unit', + $ K-I, + $ T( I+1, I+1 ), LDT, T( I+1, I ), 1 ) + IF( I.GT.1 ) THEN + PREVLASTV = MIN( PREVLASTV, LASTV ) + ELSE + PREVLASTV = LASTV + END IF + END IF + T( I, I ) = TAU( I ) + END IF + END DO + END IF + RETURN +* +* End of CLARFT +* + END diff --git a/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/dlarft.f b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/dlarft.f new file mode 100644 index 0000000000..19b7c7b1b2 --- /dev/null +++ b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/dlarft.f @@ -0,0 +1,326 @@ +*> \brief \b DLARFT VARIANT: left-looking Level 2 BLAS version of the algorithm +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download DLARFT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* +* .. Scalar Arguments .. +* CHARACTER DIRECT, STOREV +* INTEGER K, LDT, LDV, N +* .. +* .. Array Arguments .. +* DOUBLE PRECISION T( LDT, * ), TAU( * ), V( LDV, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DLARFT forms the triangular factor T of a real block reflector H +*> of order n, which is defined as a product of k elementary reflectors. +*> +*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; +*> +*> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. +*> +*> If STOREV = 'C', the vector which defines the elementary reflector +*> H(i) is stored in the i-th column of the array V, and +*> +*> H = I - V * T * V**T +*> +*> If STOREV = 'R', the vector which defines the elementary reflector +*> H(i) is stored in the i-th row of the array V, and +*> +*> H = I - V**T * T * V +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Specifies the order in which the elementary reflectors are +*> multiplied to form the block reflector: +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Specifies how the vectors which define the elementary +*> reflectors are stored (see also Further Details): +*> = 'C': columnwise +*> = 'R': rowwise +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the block reflector H. N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the triangular factor T (= the number of +*> elementary reflectors). K >= 1. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is DOUBLE PRECISION array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,N) if STOREV = 'R' +*> The matrix V. See further details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K. 
+*> \endverbatim +*> +*> \param[in] TAU +*> \verbatim +*> TAU is DOUBLE PRECISION array, dimension (K) +*> TAU(i) must contain the scalar factor of the elementary +*> reflector H(i). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is DOUBLE PRECISION array, dimension (LDT,K) +*> The k by k triangular factor T of the block reflector. +*> If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is +*> lower triangular. The rest of the array is not used. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larft +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The elements equal to 1 are not stored. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER DIRECT, STOREV + INTEGER K, LDT, LDV, N +* .. +* .. Array Arguments .. + DOUBLE PRECISION T( LDT, * ), TAU( * ), V( LDV, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. + INTEGER I, J, PREVLASTV, LASTV +* .. +* .. External Subroutines .. + EXTERNAL DGEMV, DTRMV +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( LSAME( DIRECT, 'F' ) ) THEN + PREVLASTV = N + DO I = 1, K + PREVLASTV = MAX( I, PREVLASTV ) + IF( TAU( I ).EQ.ZERO ) THEN +* +* H(i) = I +* + DO J = 1, I + T( J, I ) = ZERO + END DO + ELSE +* +* general case +* + IF( LSAME( STOREV, 'C' ) ) THEN +* Skip any trailing zeros. + DO LASTV = N, I+1, -1 + IF( V( LASTV, I ).NE.ZERO ) EXIT + END DO + DO J = 1, I-1 + T( J, I ) = -TAU( I ) * V( I , J ) + END DO + J = MIN( LASTV, PREVLASTV ) +* +* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**T * V(i:j,i) +* + CALL DGEMV( 'Transpose', J-I, I-1, -TAU( I ), + $ V( I+1, 1 ), LDV, V( I+1, I ), 1, ONE, + $ T( 1, I ), 1 ) + ELSE +* Skip any trailing zeros. 
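+*              (LASTV scans row I of V for its last nonzero entry;
+*              together with PREVLASTV it restricts the DGEMV below
+*              to the nonzero part of V.)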
+ DO LASTV = N, I+1, -1 + IF( V( I, LASTV ).NE.ZERO ) EXIT + END DO + DO J = 1, I-1 + T( J, I ) = -TAU( I ) * V( J , I ) + END DO + J = MIN( LASTV, PREVLASTV ) +* +* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**T +* + CALL DGEMV( 'No transpose', I-1, J-I, -TAU( I ), + $ V( 1, I+1 ), LDV, V( I, I+1 ), LDV, ONE, + $ T( 1, I ), 1 ) + END IF +* +* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) +* + CALL DTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, + $ T, + $ LDT, T( 1, I ), 1 ) + T( I, I ) = TAU( I ) + IF( I.GT.1 ) THEN + PREVLASTV = MAX( PREVLASTV, LASTV ) + ELSE + PREVLASTV = LASTV + END IF + END IF + END DO + ELSE + PREVLASTV = 1 + DO I = K, 1, -1 + IF( TAU( I ).EQ.ZERO ) THEN +* +* H(i) = I +* + DO J = I, K + T( J, I ) = ZERO + END DO + ELSE +* +* general case +* + IF( I.LT.K ) THEN + IF( LSAME( STOREV, 'C' ) ) THEN +* Skip any leading zeros. + DO LASTV = 1, I-1 + IF( V( LASTV, I ).NE.ZERO ) EXIT + END DO + DO J = I+1, K + T( J, I ) = -TAU( I ) * V( N-K+I , J ) + END DO + J = MAX( LASTV, PREVLASTV ) +* +* T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**T * V(j:n-k+i,i) +* + CALL DGEMV( 'Transpose', N-K+I-J, K-I, + $ -TAU( I ), + $ V( J, I+1 ), LDV, V( J, I ), 1, ONE, + $ T( I+1, I ), 1 ) + ELSE +* Skip any leading zeros. + DO LASTV = 1, I-1 + IF( V( I, LASTV ).NE.ZERO ) EXIT + END DO + DO J = I+1, K + T( J, I ) = -TAU( I ) * V( J, N-K+I ) + END DO + J = MAX( LASTV, PREVLASTV ) +* +* T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**T +* + CALL DGEMV( 'No transpose', K-I, N-K+I-J, + $ -TAU( I ), V( I+1, J ), LDV, V( I, J ), LDV, + $ ONE, T( I+1, I ), 1 ) + END IF +* +* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) +* + CALL DTRMV( 'Lower', 'No transpose', 'Non-unit', + $ K-I, + $ T( I+1, I+1 ), LDT, T( I+1, I ), 1 ) + IF( I.GT.1 ) THEN + PREVLASTV = MIN( PREVLASTV, LASTV ) + ELSE + PREVLASTV = LASTV + END IF + END IF + T( I, I ) = TAU( I ) + END IF + END DO + END IF + RETURN +* +* End of DLARFT +* + END diff --git a/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f index 8b13789179..e1578e2587 100644 --- a/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f +++ b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/slarft.f @@ -1 +1,326 @@ - +*> \brief \b SLARFT VARIANT: left-looking Level 2 BLAS version of the algorithm. +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLARFT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* +* .. Scalar Arguments .. +* CHARACTER DIRECT, STOREV +* INTEGER K, LDT, LDV, N +* .. +* .. Array Arguments .. +* REAL T( LDT, * ), TAU( * ), V( LDV, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFT forms the triangular factor T of a real block reflector H +*> of order n, which is defined as a product of k elementary reflectors. +*> +*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; +*> +*> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. 
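+*>
+*> This is the original non-recursive implementation, retained as a
+*> variant; the recursive Level 3 BLAS version now replaces it in
+*> SRC/slarft.f.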
+*> +*> If STOREV = 'C', the vector which defines the elementary reflector +*> H(i) is stored in the i-th column of the array V, and +*> +*> H = I - V * T * V**T +*> +*> If STOREV = 'R', the vector which defines the elementary reflector +*> H(i) is stored in the i-th row of the array V, and +*> +*> H = I - V**T * T * V +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Specifies the order in which the elementary reflectors are +*> multiplied to form the block reflector: +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Specifies how the vectors which define the elementary +*> reflectors are stored (see also Further Details): +*> = 'C': columnwise +*> = 'R': rowwise +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the block reflector H. N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the triangular factor T (= the number of +*> elementary reflectors). K >= 1. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is REAL array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,N) if STOREV = 'R' +*> The matrix V. See further details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] TAU +*> \verbatim +*> TAU is REAL array, dimension (K) +*> TAU(i) must contain the scalar factor of the elementary +*> reflector H(i). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is REAL array, dimension (LDT,K) +*> The k by k triangular factor T of the block reflector. +*> If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is +*> lower triangular. The rest of the array is not used. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larft +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The elements equal to 1 are not stored. +*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE SLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER DIRECT, STOREV + INTEGER K, LDT, LDV, N +* .. +* .. Array Arguments .. + REAL T( LDT, * ), TAU( * ), V( LDV, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. 
+ REAL ONE, ZERO + PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) +* .. +* .. Local Scalars .. + INTEGER I, J, PREVLASTV, LASTV +* .. +* .. External Subroutines .. + EXTERNAL SGEMV, STRMV +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( LSAME( DIRECT, 'F' ) ) THEN + PREVLASTV = N + DO I = 1, K + PREVLASTV = MAX( I, PREVLASTV ) + IF( TAU( I ).EQ.ZERO ) THEN +* +* H(i) = I +* + DO J = 1, I + T( J, I ) = ZERO + END DO + ELSE +* +* general case +* + IF( LSAME( STOREV, 'C' ) ) THEN +* Skip any trailing zeros. + DO LASTV = N, I+1, -1 + IF( V( LASTV, I ).NE.ZERO ) EXIT + END DO + DO J = 1, I-1 + T( J, I ) = -TAU( I ) * V( I , J ) + END DO + J = MIN( LASTV, PREVLASTV ) +* +* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**T * V(i:j,i) +* + CALL SGEMV( 'Transpose', J-I, I-1, -TAU( I ), + $ V( I+1, 1 ), LDV, V( I+1, I ), 1, ONE, + $ T( 1, I ), 1 ) + ELSE +* Skip any trailing zeros. + DO LASTV = N, I+1, -1 + IF( V( I, LASTV ).NE.ZERO ) EXIT + END DO + DO J = 1, I-1 + T( J, I ) = -TAU( I ) * V( J , I ) + END DO + J = MIN( LASTV, PREVLASTV ) +* +* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**T +* + CALL SGEMV( 'No transpose', I-1, J-I, -TAU( I ), + $ V( 1, I+1 ), LDV, V( I, I+1 ), LDV, + $ ONE, T( 1, I ), 1 ) + END IF +* +* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) +* + CALL STRMV( 'Upper', 'No transpose', 'Non-unit', I-1, + $ T, + $ LDT, T( 1, I ), 1 ) + T( I, I ) = TAU( I ) + IF( I.GT.1 ) THEN + PREVLASTV = MAX( PREVLASTV, LASTV ) + ELSE + PREVLASTV = LASTV + END IF + END IF + END DO + ELSE + PREVLASTV = 1 + DO I = K, 1, -1 + IF( TAU( I ).EQ.ZERO ) THEN +* +* H(i) = I +* + DO J = I, K + T( J, I ) = ZERO + END DO + ELSE +* +* general case +* + IF( I.LT.K ) THEN + IF( LSAME( STOREV, 'C' ) ) THEN +* Skip any leading zeros. + DO LASTV = 1, I-1 + IF( V( LASTV, I ).NE.ZERO ) EXIT + END DO + DO J = I+1, K + T( J, I ) = -TAU( I ) * V( N-K+I , J ) + END DO + J = MAX( LASTV, PREVLASTV ) +* +* T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**T * V(j:n-k+i,i) +* + CALL SGEMV( 'Transpose', N-K+I-J, K-I, + $ -TAU( I ), + $ V( J, I+1 ), LDV, V( J, I ), 1, ONE, + $ T( I+1, I ), 1 ) + ELSE +* Skip any leading zeros. + DO LASTV = 1, I-1 + IF( V( I, LASTV ).NE.ZERO ) EXIT + END DO + DO J = I+1, K + T( J, I ) = -TAU( I ) * V( J, N-K+I ) + END DO + J = MAX( LASTV, PREVLASTV ) +* +* T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**T +* + CALL SGEMV( 'No transpose', K-I, N-K+I-J, + $ -TAU( I ), V( I+1, J ), LDV, V( I, J ), LDV, + $ ONE, T( I+1, I ), 1 ) + END IF +* +* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) +* + CALL STRMV( 'Lower', 'No transpose', 'Non-unit', + $ K-I, + $ T( I+1, I+1 ), LDT, T( I+1, I ), 1 ) + IF( I.GT.1 ) THEN + PREVLASTV = MIN( PREVLASTV, LASTV ) + ELSE + PREVLASTV = LASTV + END IF + END IF + T( I, I ) = TAU( I ) + END IF + END DO + END IF + RETURN +* +* End of SLARFT +* + END diff --git a/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/zlarft.f b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/zlarft.f new file mode 100644 index 0000000000..6abadd501e --- /dev/null +++ b/lapack-netlib/SRC/VARIANTS/larft/LL_LVL2/zlarft.f @@ -0,0 +1,327 @@ +*> \brief \b ZLARFT VARIANT: left-looking Level 2 BLAS version of the algorithm. 
+* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLARFT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* +* .. Scalar Arguments .. +* CHARACTER DIRECT, STOREV +* INTEGER K, LDT, LDV, N +* .. +* .. Array Arguments .. +* COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFT forms the triangular factor T of a complex block reflector H +*> of order n, which is defined as a product of k elementary reflectors. +*> +*> If DIRECT = 'F', H = H(1) H(2) . . . H(k) and T is upper triangular; +*> +*> If DIRECT = 'B', H = H(k) . . . H(2) H(1) and T is lower triangular. +*> +*> If STOREV = 'C', the vector which defines the elementary reflector +*> H(i) is stored in the i-th column of the array V, and +*> +*> H = I - V * T * V**H +*> +*> If STOREV = 'R', the vector which defines the elementary reflector +*> H(i) is stored in the i-th row of the array V, and +*> +*> H = I - V**H * T * V +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] DIRECT +*> \verbatim +*> DIRECT is CHARACTER*1 +*> Specifies the order in which the elementary reflectors are +*> multiplied to form the block reflector: +*> = 'F': H = H(1) H(2) . . . H(k) (Forward) +*> = 'B': H = H(k) . . . H(2) H(1) (Backward) +*> \endverbatim +*> +*> \param[in] STOREV +*> \verbatim +*> STOREV is CHARACTER*1 +*> Specifies how the vectors which define the elementary +*> reflectors are stored (see also Further Details): +*> = 'C': columnwise +*> = 'R': rowwise +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The order of the block reflector H. N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The order of the triangular factor T (= the number of +*> elementary reflectors). K >= 1. +*> \endverbatim +*> +*> \param[in] V +*> \verbatim +*> V is COMPLEX*16 array, dimension +*> (LDV,K) if STOREV = 'C' +*> (LDV,N) if STOREV = 'R' +*> The matrix V. See further details. +*> \endverbatim +*> +*> \param[in] LDV +*> \verbatim +*> LDV is INTEGER +*> The leading dimension of the array V. +*> If STOREV = 'C', LDV >= max(1,N); if STOREV = 'R', LDV >= K. +*> \endverbatim +*> +*> \param[in] TAU +*> \verbatim +*> TAU is COMPLEX*16 array, dimension (K) +*> TAU(i) must contain the scalar factor of the elementary +*> reflector H(i). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,K) +*> The k by k triangular factor T of the block reflector. +*> If DIRECT = 'F', T is upper triangular; if DIRECT = 'B', T is +*> lower triangular. The rest of the array is not used. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup larft +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> The shape of the matrix V and the storage of the vectors which define +*> the H(i) is best illustrated by the following example with n = 5 and +*> k = 3. The elements equal to 1 are not stored. 
+*> +*> DIRECT = 'F' and STOREV = 'C': DIRECT = 'F' and STOREV = 'R': +*> +*> V = ( 1 ) V = ( 1 v1 v1 v1 v1 ) +*> ( v1 1 ) ( 1 v2 v2 v2 ) +*> ( v1 v2 1 ) ( 1 v3 v3 ) +*> ( v1 v2 v3 ) +*> ( v1 v2 v3 ) +*> +*> DIRECT = 'B' and STOREV = 'C': DIRECT = 'B' and STOREV = 'R': +*> +*> V = ( v1 v2 v3 ) V = ( v1 v1 1 ) +*> ( v1 v2 v3 ) ( v2 v2 v2 1 ) +*> ( 1 v2 v3 ) ( v3 v3 v3 v3 1 ) +*> ( 1 v3 ) +*> ( 1 ) +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZLARFT( DIRECT, STOREV, N, K, V, LDV, TAU, T, LDT ) +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER DIRECT, STOREV + INTEGER K, LDT, LDV, N +* .. +* .. Array Arguments .. + COMPLEX*16 T( LDT, * ), TAU( * ), V( LDV, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 ONE, ZERO + PARAMETER ( ONE = ( 1.0D+0, 0.0D+0 ), + $ ZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + INTEGER I, J, PREVLASTV, LASTV +* .. +* .. External Subroutines .. + EXTERNAL ZGEMV, ZTRMV, ZGEMM +* .. +* .. External Functions .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( N.EQ.0 ) + $ RETURN +* + IF( LSAME( DIRECT, 'F' ) ) THEN + PREVLASTV = N + DO I = 1, K + PREVLASTV = MAX( PREVLASTV, I ) + IF( TAU( I ).EQ.ZERO ) THEN +* +* H(i) = I +* + DO J = 1, I + T( J, I ) = ZERO + END DO + ELSE +* +* general case +* + IF( LSAME( STOREV, 'C' ) ) THEN +* Skip any trailing zeros. + DO LASTV = N, I+1, -1 + IF( V( LASTV, I ).NE.ZERO ) EXIT + END DO + DO J = 1, I-1 + T( J, I ) = -TAU( I ) * CONJG( V( I , J ) ) + END DO + J = MIN( LASTV, PREVLASTV ) +* +* T(1:i-1,i) := - tau(i) * V(i:j,1:i-1)**H * V(i:j,i) +* + CALL ZGEMV( 'Conjugate transpose', J-I, I-1, + $ -TAU( I ), V( I+1, 1 ), LDV, + $ V( I+1, I ), 1, ONE, T( 1, I ), 1 ) + ELSE +* Skip any trailing zeros. + DO LASTV = N, I+1, -1 + IF( V( I, LASTV ).NE.ZERO ) EXIT + END DO + DO J = 1, I-1 + T( J, I ) = -TAU( I ) * V( J , I ) + END DO + J = MIN( LASTV, PREVLASTV ) +* +* T(1:i-1,i) := - tau(i) * V(1:i-1,i:j) * V(i,i:j)**H +* + CALL ZGEMM( 'N', 'C', I-1, 1, J-I, -TAU( I ), + $ V( 1, I+1 ), LDV, V( I, I+1 ), LDV, + $ ONE, T( 1, I ), LDT ) + END IF +* +* T(1:i-1,i) := T(1:i-1,1:i-1) * T(1:i-1,i) +* + CALL ZTRMV( 'Upper', 'No transpose', 'Non-unit', I-1, + $ T, + $ LDT, T( 1, I ), 1 ) + T( I, I ) = TAU( I ) + IF( I.GT.1 ) THEN + PREVLASTV = MAX( PREVLASTV, LASTV ) + ELSE + PREVLASTV = LASTV + END IF + END IF + END DO + ELSE + PREVLASTV = 1 + DO I = K, 1, -1 + IF( TAU( I ).EQ.ZERO ) THEN +* +* H(i) = I +* + DO J = I, K + T( J, I ) = ZERO + END DO + ELSE +* +* general case +* + IF( I.LT.K ) THEN + IF( LSAME( STOREV, 'C' ) ) THEN +* Skip any leading zeros. + DO LASTV = 1, I-1 + IF( V( LASTV, I ).NE.ZERO ) EXIT + END DO + DO J = I+1, K + T( J, I ) = -TAU( I ) * CONJG( V( N-K+I , J ) ) + END DO + J = MAX( LASTV, PREVLASTV ) +* +* T(i+1:k,i) = -tau(i) * V(j:n-k+i,i+1:k)**H * V(j:n-k+i,i) +* + CALL ZGEMV( 'Conjugate transpose', N-K+I-J, K-I, + $ -TAU( I ), V( J, I+1 ), LDV, V( J, I ), + $ 1, ONE, T( I+1, I ), 1 ) + ELSE +* Skip any leading zeros. 
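+*                 (With DIRECT = 'B' the unit element of each reflector
+*                 vector lies at its far end, so any structural zeros
+*                 precede the essential part; LASTV locates the first
+*                 nonzero entry so that the update below touches only
+*                 the nonzero section of the vector.)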
+ DO LASTV = 1, I-1 + IF( V( I, LASTV ).NE.ZERO ) EXIT + END DO + DO J = I+1, K + T( J, I ) = -TAU( I ) * V( J, N-K+I ) + END DO + J = MAX( LASTV, PREVLASTV ) +* +* T(i+1:k,i) = -tau(i) * V(i+1:k,j:n-k+i) * V(i,j:n-k+i)**H +* + CALL ZGEMM( 'N', 'C', K-I, 1, N-K+I-J, + $ -TAU( I ), + $ V( I+1, J ), LDV, V( I, J ), LDV, + $ ONE, T( I+1, I ), LDT ) + END IF +* +* T(i+1:k,i) := T(i+1:k,i+1:k) * T(i+1:k,i) +* + CALL ZTRMV( 'Lower', 'No transpose', 'Non-unit', + $ K-I, + $ T( I+1, I+1 ), LDT, T( I+1, I ), 1 ) + IF( I.GT.1 ) THEN + PREVLASTV = MIN( PREVLASTV, LASTV ) + ELSE + PREVLASTV = LASTV + END IF + END IF + T( I, I ) = TAU( I ) + END IF + END DO + END IF + RETURN +* +* End of ZLARFT +* + END From ce66ffe7bb5e554ac9c87c4b49fad13f122ce769 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Jan 2025 00:57:10 +0100 Subject: [PATCH 243/244] Update the Changelog for version 0.3.29 --- Changelog.txt | 94 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index 7f89a2eab7..b131dca5c4 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,98 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.29 +12-Jan-2025 + +general: + - fixed a potential NULL pointer dereference in multithreaded builds + - added function aliases for GEMMT using its new name GEMMTR adopted by Reference-BLAS + - fixed a build failure when building without LAPACK_DEPRECATED functions + - the minimum required CMake version for CMake-based builds was raised to 3.16.0 in order + to remove many compatibility and deprecation warnings + - added more detailed CMake rules for OpenMP builds (mainly to support recent LLVM) + - fixed the behavior of the recently added CBLAS_?GEMMT functions with row-major data + - improved thread scaling of multithreaded SBGEMV + - improved thread scaling of multithreaded TRTRI + - fixed compilation of the CBLAS testsuite with gcc14 (and no Fortran compiler) + - added support for option handling changes in flang-new from LLVM18 onwards + - added support for recent calling conventions changes in Cray and NVIDIA compilers + - added support for compilation with the NAG Fortran compiler + - fixed placement of the -fopenmp flag and libsuffix in the generated pkgconfig file + - improved the CMakeConfig file generated by the Makefile build + - fixed const-correctness of cblas_?geadd in cblas.h + - fixed a potential inaccuracy in multithreaded BLAS3 calls + - fixed empty implementations of get/set_affinity that print a warning in OpenMP builds + - fixed function signatures for TRTRS in the converted C version of LAPACK + - fixed omission of several single-precision LAPACK symbols in the shared library + - improved build instructions for the provided "pybench" benchmarks + - improved documentation, including added build instructions for WoA and HarmonyOS + - added a separate "make install_tests" target for use with cross-compilations + - integrated improvements and corrections from Reference-LAPACK: + - removed a comparison in LAPACKE ?tpmqrt that is always false (LAPACK PR 1062) + - fixed the leading dimension for B in tests for GGEV (LAPACK PR 1064) + - replaced the ?LARFT functions with a recursive implementation (LAPACK PR 1080) + +arm: + - fixed build with recent versions of the NDK (missing .type declaration of symbols) + +arm64: + - fixed a long-standing bug in the (generic) c/zgemm_beta kernel that could lead to + reads and writes outside the array bounds in some 
circumstances
+ - rewrote cpu autodetection to scan all cores and return the highest performing type
+ - improved the DGEMM performance for SVE targets and small matrix sizes
+ - improved dimension criteria for forwarding from GEMM to GEMV kernels
+ - added SVE kernels for ROT and SWAP
+ - improved SVE kernels for SGEMV and DGEMV on A64FX and NEOVERSEV1
+ - added support for using the "small matrix" kernels with CMake as well
+ - fixed compilation on Windows on Arm
+ - improved compile-time detection of SVE capability
+ - added cpu autodetection and initial support for Apple M4
+ - added support for compilation on systems running iOS
+ - added support for compilation on NetBSD ("evbarm" architecture)
+ - fixed NRM2 implementations for generic SVE targets and the Neoverse N2
+ - fixed compilation for SVE-capable targets with the NVIDIA compiler
+
+x86_64:
+ - fixed a wrong storage size in the SBGEMV kernel for Cooper Lake
+ - added cpu autodetection for Intel Granite Rapids
+ - added cpu autodetection for AMD Ryzen 5 series
+ - added optimized SOMATCOPY_CT for AVX-capable targets
+ - fixed the fallback implementation of GEMM3M in GENERIC builds
+ - tentatively re-enabled builds with the EXPRECISION option
+ - worked around a miscompilation of tests with mingw32-gfortran14
+ - added support for compilation with the Intel oneAPI 2025.0 compiler on Windows
+
+power:
+ - fixed multithreaded SBGEMM
+ - fixed a CMake build problem on POWER10
+ - improved the performance of SGEMV
+ - added vectorized implementations of SBGEMV and support for forwarding 1xN SBGEMM to them
+ - fixed illegal instructions and potential memory overflow in SGEMM on PPCG4
+ - fixed handling of NaN and Inf arguments in SSCAL and DSCAL on PPC440, G4 and 970
+ - added improved CGEMM and ZGEMM kernels for POWER10
+ - added Makefile logic to remove all optimization flags in DEBUG builds
+
+mips64:
+ - fixed compilation with gcc14
+ - fixed GEMM parameter selection for the MIPS64_GENERIC target
+ - fixed a potential build failure when compiling with OpenMP
+
+loongarch64:
+ - fixed compilation for Loongson3 with recent versions of gmake
+ - fixed a potential loss of precision in Loongson3A GEMM
+ - fixed a potential build failure when compiling with OpenMP
+ - added optimized SOMATCOPY for LASX-capable targets
+ - introduced a new cpu naming scheme while retaining compatibility
+ - added support for cross-compiling Loongarch64 targets with CMake
+ - added support for compilation with LLVM
+
+riscv64:
+ - removed thread yielding overhead caused by sched_yield
+ - replaced some non-standard intrinsics with their official names
+ - fixed and sped up the implementations of CGEMM/ZGEMM TCOPY for vector lengths 128 and 256
+ - improved the performance of SNRM2/DNRM2 for RVV1.0 targets
+ - added optimized ?OMATCOPY_CN kernels for RVV1.0 targets
+
 ====================================================================
 Version 0.3.28
 8-Aug-2024

From 20f6114e98bce519a3c4f8af4a22a868a1c2d946 Mon Sep 17 00:00:00 2001
From: Martin Kroeker
Date: Sun, 12 Jan 2025 13:12:41 +0100
Subject: [PATCH 244/244] add descriptions of build/runtime vars to 0.3.29 improvements

---
 Changelog.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Changelog.txt b/Changelog.txt
index b131dca5c4..b52734c82c 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -26,6 +26,7 @@ general:
 - fixed omission of several single-precision LAPACK symbols in the shared library
 - improved build instructions for the provided "pybench" benchmarks
 - improved documentation, including
added build instructions for WoA and HarmonyOS + as well as descriptions of environment variables that affect build and runtime behavior - added a separate "make install_tests" target for use with cross-compilations - integrated improvements and corrections from Reference-LAPACK: - removed a comparison in LAPACKE ?tpmqrt that is always false (LAPACK PR 1062)
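
For reference, a minimal driver sketch (not part of the patches above; the
program name TLARFT, the 5x3 test matrix and the fixed LWORK are illustrative
assumptions only) showing how the LARFT variants are typically fed: SGEQRF
returns the reflectors in exactly the DIRECT = 'F' / STOREV = 'C' layout that
SLARFT consumes. Link against OpenBLAS or any other LAPACK implementation to
build it.

      PROGRAM TLARFT
      INTEGER            N, K, LWORK
      PARAMETER          ( N = 5, K = 3, LWORK = 64 )
      REAL               A( N, K ), TAU( K ), T( K, K ), WORK( LWORK )
      INTEGER            I, J, INFO
*     Simple full-rank (Cauchy-like) test data.
      DO J = 1, K
         DO I = 1, N
            A( I, J ) = REAL( I ) / REAL( I + J )
         END DO
      END DO
*     QR factorization; the reflectors H(i) are stored columnwise
*     below the diagonal of A with scalar factors in TAU, which is
*     exactly the STOREV = 'C' layout described above.
      CALL SGEQRF( N, K, A, N, TAU, WORK, LWORK, INFO )
*     SLARFT writes only the upper triangle of T; clear the array
*     first so the printout below is well defined.
      DO J = 1, K
         DO I = 1, K
            T( I, J ) = 0.0E+0
         END DO
      END DO
*     Form the K-by-K upper triangular T for H = H(1) H(2) H(3).
      CALL SLARFT( 'Forward', 'Columnwise', N, K, A, N, TAU, T, K )
      DO I = 1, K
         WRITE( *, '( 3F12.6 )' ) ( T( I, J ), J = 1, K )
      END DO
      END

With these arguments T is upper triangular and satisfies
H = I - V * T * V**T, the form consumed by SLARFB when the block reflector
is applied to a matrix.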