diff --git a/.github/workflows/build-aarch64.yml b/.github/workflows/build-aarch64.yml new file mode 100644 index 00000000..9d722a05 --- /dev/null +++ b/.github/workflows/build-aarch64.yml @@ -0,0 +1,37 @@ +name: Build (aarch64) + +on: + push: + branches: + - master + pull_request: + branches: + - master + +env: + # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) + BUILD_TYPE: Debug + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up QEMU for ARM + uses: docker/setup-qemu-action@v2 + with: + platforms: arm64 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Build Docker image for ARM + run: | + docker buildx create --use + docker buildx build --platform linux/arm64 -t my-arm-build --load -f docker/aarch64/Dockerfile . + + # - name: Run tests on ARM Docker container + # run: | + # docker run --rm my-arm-build ./run-tests.sh diff --git a/.github/workflows/cmake.yml b/.github/workflows/build-and-test-x86_64.yml similarity index 96% rename from .github/workflows/cmake.yml rename to .github/workflows/build-and-test-x86_64.yml index ece754f4..802c80fb 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/build-and-test-x86_64.yml @@ -1,4 +1,4 @@ -name: CMake +name: Build and test (x86_64) on: push: @@ -65,6 +65,7 @@ jobs: cmake -B ${{github.workspace}}/build \ -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} \ -DCMAKE_CXX_COMPILER=clang++-18 \ + -DCMAKE_CXX_FLAGS="-mfma -mavx -mavx2 -DXSIMD_DEFAULT_ARCH=\"fma3\"" \ -DBLAST_WITH_BENCHMARK=ON \ -DBLAST_WITH_TEST=ON diff --git a/CMakeLists.txt b/CMakeLists.txt index 2828f75c..ec760caf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,9 +44,6 @@ target_link_libraries(blast target_compile_options(blast INTERFACE "-Wno-ignored-attributes" "-fno-math-errno" "-ftemplate-backtrace-limit=0" - # Enable SIMD instruction sets, otherwise it does not compile. 
- # This will change when we support multiple architectures. - INTERFACE "-march=native" "-mfma" "-mavx" "-mavx2" "-msse4" ) # BLAST_WITH_BLASFEO diff --git a/bench/blast/math/dense/DynamicIamax.cpp b/bench/blast/math/dense/DynamicIamax.cpp index ce28e659..eaf599a7 100644 --- a/bench/blast/math/dense/DynamicIamax.cpp +++ b/bench/blast/math/dense/DynamicIamax.cpp @@ -1,20 +1,9 @@ -// Copyright 2023 Mikhail Katliar -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Copyright (c) 2023-2024 Mikhail Katliar All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
#include - -#include +#include #include #include diff --git a/bench/common/CMakeLists.txt b/bench/common/CMakeLists.txt index 2ed8b4d0..030ae1e9 100644 --- a/bench/common/CMakeLists.txt +++ b/bench/common/CMakeLists.txt @@ -13,6 +13,9 @@ target_link_libraries(bench-blast-common PUBLIC benchmark::benchmark ) -target_compile_options(bench-blast-common - PUBLIC "-mllvm" "-inline-threshold=1000" -) +if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + # More aggressive inlining with Clang + target_compile_options(bench-blast-common + PUBLIC "-mllvm" "-inline-threshold=1000" + ) +endif() diff --git a/docker/aarch64/Dockerfile b/docker/aarch64/Dockerfile new file mode 100644 index 00000000..f8fafdbc --- /dev/null +++ b/docker/aarch64/Dockerfile @@ -0,0 +1,46 @@ +FROM ubuntu:latest +WORKDIR /root +RUN apt-get update +# RUN apt-get upgrade -y +RUN DEBIAN_FRONTEND="noninteractive" apt-get install -y \ + build-essential clang-18 cmake git libopenblas-dev libboost-exception-dev pkg-config + +# Install GTest and GMock +RUN apt install -y libgtest-dev libgmock-dev + +# Install Google benchmark +RUN apt install -y libbenchmark-dev + +# Install Blaze +RUN git clone https://bitbucket.org/blaze-lib/blaze.git +RUN cd blaze && cmake -DBLAZE_BLAS_MODE=True -DBLAZE_BLAS_USE_MATRIX_MATRIX_MULTIPLICATION=False \ + -DBLAZE_BLAS_USE_MATRIX_VECTOR_MULTIPLICATION=False -DBLAZE_VECTORIZATION=False -DBLAZE_SHARED_MEMORY_PARALLELIZATION=False . 
&& make install + +# Install Eigen3 +RUN apt install -y libeigen3-dev + +# Install blasfeo +RUN apt-get install -y bc +RUN git clone https://github.com/giaf/blasfeo.git +RUN cd blasfeo && git checkout cc90e146ee9089de518f57dbb736e064bd82394e +COPY docker/aarch64/blasfeo/Makefile.rule blasfeo +RUN cd blasfeo && make -j `nproc` static_library && make install_static + +# Install xsimd +RUN apt install -y libxsimd-dev + +# Install Clang-18 +RUN apt install -y clang-18 +ENV CC=clang-18 +ENV CXX=clang++-18 + +# Build blast +WORKDIR /blast +COPY bench ./bench +COPY cmake ./cmake +COPY include ./include +COPY test ./test +COPY CMakeLists.txt . +ENV PKG_CONFIG_PATH=/usr/local/lib +RUN cmake -B build -DCMAKE_CXX_FLAGS="-march=native -DXSIMD_DEFAULT_ARCH='neon64'" -DBLAST_WITH_TEST=ON -DBLAST_WITH_BENCHMARK=ON . +RUN cd build && make -j `nproc` diff --git a/docker/aarch64/blasfeo/Makefile.rule b/docker/aarch64/blasfeo/Makefile.rule new file mode 100644 index 00000000..3072fef0 --- /dev/null +++ b/docker/aarch64/blasfeo/Makefile.rule @@ -0,0 +1,479 @@ +################################################################################################### +# # +# This file is part of BLASFEO. # +# # +# BLASFEO -- BLAS for embedded optimization. # +# Copyright (C) 2019 by Gianluca Frison. # +# Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. # +# All rights reserved. # +# # +# The 2-Clause BSD License # +# # +# Redistribution and use in source and binary forms, with or without # +# modification, are permitted provided that the following conditions are met: # +# # +# 1. Redistributions of source code must retain the above copyright notice, this # +# list of conditions and the following disclaimer. # +# 2. Redistributions in binary form must reproduce the above copyright notice, # +# this list of conditions and the following disclaimer in the documentation # +# and/or other materials provided with the distribution. 
# +# # +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # +# # +# Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de # +# # +################################################################################################### + +# Do something in this makefile +$(info Parsing Makefile.rule) + +# Get path of Makefile.rule as main project directory +#CURRENT_DIR := $(dir $(lastword $(MAKEFILE_LIST))) +MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) +CURRENT_DIR := $(patsubst %/,%,$(dir $(MAKEFILE_PATH))) + + + +################################################# +### main makefile options +################################################# + +# Select target architecture (TARGET) +# +# X64_INTEL_HASWELL: x86_64 architecture with AVX2 and FMA ISA (64 bit OS) +# Code optimized for Intel Haswell, Intel Skylake and AMD Zen architectures. +# +# X64_INTEL_SANDY_BRIDGE : x86_64 architecture with AVX ISA (64 bit OS) +# Code optimized for Intel Sandy-Bridge architecture. +# +# X64_INTEL_CORE : x86_64 architecture with SSE3 ISA (64 bit OS) +# Code optimized for Intel Core archiecture. +# +# X64_AMD_BULLDOZER : x86_64 architecture with AVX and FMA ISA (64 bit OS) +# Code optimized for AMD Bulldozer. 
+# +# X86_AMD_JAGUAR : x86 architecture with AVX ISA (32 bit OS) +# Code optimized for AMD Jaguar. +# +# X86_AMD_BARCELONA : x86 architecture with SSE3 ISA (32 bit OS) +# Code optimized for AMD Barcelona. +# +# ARMV8A_ARM_CORTEX_A57 : ARMv8A architecture with NEON (64 bit OS) +# Code optimized for ARM Cortex A57, A72, A73. +# +# ARMV8A_ARM_CORTEX_A53 : ARMv8A architecture with NEON (64 bit OS) +# Code optimized for ARM Cortex A53. +# +# ARMV7A_ARM_CORTEX_A15 : ARMv7A architecture with NEON-VFPv4 ISA (32 bit OS) +# Code optimized for ARM Cortex A15. +# +# ARMV7A_ARM_CORTEX_A9 : ARMv7A architecture with NEON-VFPv3 ISA (32 bit OS) +# Code optimized for ARM Cortex A9. +# +# ARMV7A_ARM_CORTEX_A7 : ARMv7A architecture with NEON-VFPv4 ISA (32 bit OS) +# Code optimized for ARM Cortex A7. +# +# GENERIC : generic architecture, plain C code. +# +# +# TARGET = X64_INTEL_HASWELL +# TARGET = X64_INTEL_SANDY_BRIDGE +# TARGET = X64_INTEL_CORE +# +# TARGET = X64_AMD_BULLDOZER +# TARGET = X86_AMD_JAGUAR +# TARGET = X86_AMD_BARCELONA +# +TARGET = ARMV8A_ARM_CORTEX_A57 +# TARGET = ARMV8A_ARM_CORTEX_A53 +# TARGET = ARMV7A_ARM_CORTEX_A15 +# TARGET = ARMV7A_ARM_CORTEX_A9 +# TARGET = ARMV7A_ARM_CORTEX_A7 +# +# TARGET = GENERIC + +# Select back-end linear lagebra version (LA): +# HIGH_PERFORMANCE : target-tailored; performance-optimized for cache resident matrices; panel-major matrix format +# REFERENCE : target-unspecific lightly-optimized; small code footprint; column-major matrix format +# EXTERNAL_BLAS_WRAPPER : call to external BLAS and LAPACK libraries; column-major matrix format +# +LA = HIGH_PERFORMANCE +# LA = REFERENCE +# LA = EXTERNAL_BLAS_WRAPPER + +# Select external BLAS and LAPACK implementation (to be provided by the user). +# Edit Makefile.external_blas to specify installation location (default /opt). +# It is used by the BLASFEO library if LA=EXTERNAL_BLAS_WRAPPER. +# It may also be used as a comparison in some benchmarks and tests. 
+# +EXTERNAL_BLAS = 0 +# EXTERNAL_BLAS = SYSTEM +# EXTERNAL_BLAS = OPENBLAS +# EXTERNAL_BLAS = NETLIB +# EXTERNAL_BLAS = MKL +# EXTERNAL_BLAS = BLIS +# EXTERNAL_BLAS = ATLAS +include $(CURRENT_DIR)/Makefile.external_blas + +# Select operating system (automatic selection for LINUX and MAC) +# +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S), Linux) + OS = LINUX +endif +ifeq ($(UNAME_S), Darwin) + OS = MAC +endif +# +# Select operating system (manual selection) +# +# OS = LINUX +# OS = MAC +# OS = WINDOWS + +# Compile the BLAS API routines provided by BLASFEO +# +# BLAS_API = 0 +BLAS_API = 1 + +# Export standard FORTRAN namings for BLAS API routines +# 0 : routines namings are in the form blasfeo_dgemm +# 1 : routines namings are in the form dgemm_ +# +FORTRAN_BLAS_API = 0 +# FORTRAN_BLAS_API = 1 + +# Complement the BLAS_API with the Netlib BLAS (only for FORTAN_BLAS_API=1) +COMPLEMENT_WITH_NETLIB_BLAS = 0 +# COMPLEMENT_WITH_NETLIB_BLAS = 1 + +# Complement the BLAS_API with the Netlib LAPACK (only for FORTAN_BLAS_API=1) +COMPLEMENT_WITH_NETLIB_LAPACK = 0 +# COMPLEMENT_WITH_NETLIB_LAPACK = 1 + +# Compile the CBLAS API routines provided by BLASFEO +# +CBLAS_API = 0 +# CBLAS_API = 1 + +#Compile the LAPACKE API from Netlib +LAPACKE_API = 0 +# LAPACKE_API = 1 + + + +################################################# +### other makefile options and settings +################################################# + +# In BLAS API, fallback to external BLAS library for some not-yet-implemented routines +# +FALLBACK_TO_EXTERNAL_BLAS = 0 +# FALLBACK_TO_EXTERNAL_BLAS = 1 + +# Maximum inner product length K for buffer allocation on stack (decrease this value if stack size is exceeded) +# +K_MAX_STACK = 1000 + +# Macro level (code size vs performance in assembly kernels): +# 0 : no macro (min code size) +# 1 : all macro but gemm kernel +# 2 : all macro (max performance) +# +MACRO_LEVEL = 1 + +# Use C99 extension to math library +# +# USE_C99_MATH = 0 +USE_C99_MATH = 1 + +# Compile 
auxiliary functions with external dependencies (for memory allocation, printing and timing) +# +# EXT_DEP = 0 +EXT_DEP = 1 + +# Compile reference implementations with test_ prefix +# in order to check HIGH_PERFORMANCE routines against reference +# TODO bug: if LA=EXTERNAL_BLAS_WRAPPER and TESTING_MODE=1, reference code is used for libblasfeo.a +# Also enables the compilation of tests +# +TESTING_MODE = 0 +# TESTING_MODE = 1 + +# Compile not-yet-implemented routine with just-return instead of print-and-exit +# Also enables the compilation of benchmarks +# +BENCHMARKS_MODE = 0 +# BENCHMARKS_MODE = 1 + +# Enables the compilation of sandbox (experimental) +# +SANDBOX_MODE = 0 +# SANDBOX_MODE = 1 + +# Enable on-line checks for matrix and vector dimensions (experimental) +# +RUNTIME_CHECKS = 0 +# RUNTIME_CHECKS = 1 + +# Print name of BLAS API routines when called (for debugging purposes) +# +PRINT_NAME = 0 +# PRINT_NAME = 1 + +# C Compiler +# +CC ?= gcc +# CC = clang +# CC = x86_64-w64-mingw32-gcc + +# archive routine +# +AR = ar + +# Installation directory +# +PREFIX = /opt + +# compiler / assembler / linker flags +# +# CFLAGS = +ASFLAGS = +LDFLAGS = + +# Common optimization flags +# +CFLAGS ?= -O2 +CFLAGS += -fPIC + +# Debugging flags +# +CFLAGS += #-g #-Wall -pedantic -Wfloat-equal #-pg +ASFLAGS += #-g + +# Profiling flags +# +#CFLAGS += --coverage + + + +# Installation directory +TOP = $(CURRENT_DIR) + +# Support local options +# TODO move somewhere else ??? 
+-include $(CURRENT_DIR)/Makefile.local + +# search directories +CFLAGS += -I$(TOP)/include + + + +# Conditional definitions and checks + +ifeq ($(LA), HIGH_PERFORMANCE) +CFLAGS += -DLA_HIGH_PERFORMANCE +BINARY_DIR = build/$(LA)/$(TARGET) +endif +ifeq ($(LA), REFERENCE) +CFLAGS += -DLA_REFERENCE +BINARY_DIR = build/$(LA)/$(TARGET) +endif +ifeq ($(LA), EXTERNAL_BLAS_WRAPPER) +ifeq ($(EXTERNAL_BLAS), 0) +$(error No EXTERNAL_BLAS selected for LA=EXTERNAL_BLAS_WRAPPER) +endif +CFLAGS += -DLA_EXTERNAL_BLAS_WRAPPER +BINARY_DIR = build/$(LA)/$(EXTERNAL_BLAS) +endif +# TODO remove and fix tests +# CFLAGS += -DBLASFEO_LA=$(LA) + +ifeq ($(BLAS_API), 1) +CFLAGS += -DBLAS_API +ASFLAGS += -DBLAS_API +ifeq ($(FORTRAN_BLAS_API), 1) +CFLAGS += -DFORTRAN_BLAS_API +ASFLAGS += +endif # FORTRAN_BLAS_API +ifeq ($(FALLBACK_TO_EXTERNAL_BLAS), 1) +CFLAGS += -DFALLBACK_TO_EXTERNAL_BLAS +ASFLAGS += -DFALLBACK_TO_EXTERNAL_BLAS +endif # FALLBACK_TO_EXTERNAL_BLAS +endif # BLAS_API + +ifeq ($(CBLAS_API), 1) +ifeq ($(FORTRAN_BLAS_API), 0) +$(error Cannot expose non-FORTRAN style BLAS_API when building CBLAS_API) +endif +endif + +ifeq ($(LAPACKE_API), 1) +ifeq ($(FORTRAN_BLAS_API), 0) +$(error Cannot expose non-FORTRAN style BLAS_API when building LAPACKE_API) +endif +endif + +ifeq ($(COMPLEMENT_WITH_NETLIB_BLAS), 1) +ifeq ($(FORTRAN_BLAS_API), 0) +$(error Cannot expose non-FORTRAN style BLAS_API when complementing with Netlib BLAS) +endif +endif + +ifeq ($(COMPLEMENT_WITH_NETLIB_LAPACK), 1) +ifeq ($(FORTRAN_BLAS_API), 0) +$(error Cannot expose non-FORTRAN style BLAS_API when complementing with Netlib LAPACK) +endif +endif + +STACK_SIZE := $(shell ulimit -s) +ifneq ($(STACK_SIZE), unlimited) +STACK_SIZE_EXCEEDED := $(shell echo $(K_MAX_STACK)*12*8*2 \> $(STACK_SIZE)*1024 | bc ) +ifeq ($(STACK_SIZE_EXCEEDED), 1) +$(error stack size likely to be exceeded, please decrease the value of K_MAX_STACK ) +endif +endif +CFLAGS += -DK_MAX_STACK=$(K_MAX_STACK) + +ifeq ($(USE_C99_MATH), 1) +CFLAGS += 
-DUSE_C99_MATH +endif + +ifeq ($(RUNTIME_CHECKS), 1) +CFLAGS += -DDIM_CHECK +endif + +ifeq ($(EXT_DEP), 1) +CFLAGS += -DEXT_DEP +endif + +ifeq ($(TESTING_MODE), 1) +CFLAGS += -DTESTING_MODE +endif + +ifeq ($(BENCHMARKS_MODE), 1) +CFLAGS += -DBENCHMARKS_MODE +endif + +ifeq ($(SANDBOX_MODE), 1) +CFLAGS += -DSANDBOX_MODE +endif + +ifeq ($(MACRO_LEVEL), 1) +ASFLAGS += -DMACRO_LEVEL=1 +endif +ifeq ($(MACRO_LEVEL), 2) +ASFLAGS += -DMACRO_LEVEL=2 +endif + +ifeq ($(PRINT_NAME), 1) +CFLAGS += -DPRINT_NAME +endif + +ifeq ($(OS), LINUX) +CFLAGS += -DOS_LINUX +ASFLAGS += -DOS_LINUX +endif +ifeq ($(OS), MAC) +CFLAGS += -DOS_MAC +ASFLAGS += -DOS_MAC +endif +ifeq ($(OS), WINDOWS) +CFLAGS += -DOS_WINDOWS +ASFLAGS += -DOS_WINDOWS +endif +ifeq ($(SOC), DSPACE) +CFLAGS += -D__DSPACE__ +ASFLAGS += -D__DSPACE__ +endif +ifeq ($(SOC), BACHMANN) +CFLAGS += -D__BACHMANN__ +ASFLAGS += -D__BACHMANN__ +endif + +# EXTERNAL_BLAS + +ifndef EXTERNAL_BLAS + EXTERNAL_BLAS = 0 +endif + +CFLAGS += $(INCLUDE_EXTERNAL_BLAS) +ifeq ($(EXTERNAL_BLAS), 0) +CFLAGS += +endif +ifeq ($(EXTERNAL_BLAS), SYSTEM) +CFLAGS += -DEXTERNAL_BLAS_SYSTEM +endif +ifeq ($(EXTERNAL_BLAS), OPENBLAS) +CFLAGS += -DEXTERNAL_BLAS_OPENBLAS +endif +ifeq ($(EXTERNAL_BLAS), BLIS) +CFLAGS += -DEXTERNAL_BLAS_BLIS -std=gnu99 +endif +ifeq ($(EXTERNAL_BLAS), NETLIB) +CFLAGS += -DEXTERNAL_BLAS_NETLIB +endif +ifeq ($(EXTERNAL_BLAS), MKL) +CFLAGS += -DEXTERNAL_BLAS_MKL -std=c99 -m64 -DMKL_DIRECT_CALL_SEQ +endif +ifeq ($(EXTERNAL_BLAS), ATLAS) +CFLAGS += -DEXTERNAL_BLAS_ATLAS +endif +# TODO remove and fix tests +# CFLAGS += -DEXTERNAL_BLAS=$(EXTERNAL_BLAS) + +# Architecture-specific flags +ifeq ($(TARGET), X64_INTEL_HASWELL) +CFLAGS += -m64 -mavx2 -mfma -DTARGET_X64_INTEL_HASWELL +endif +ifeq ($(TARGET), X64_INTEL_SANDY_BRIDGE) +CFLAGS += -m64 -mavx -DTARGET_X64_INTEL_SANDY_BRIDGE +endif +ifeq ($(TARGET), X64_INTEL_CORE) +CFLAGS += -m64 -msse3 -DTARGET_X64_INTEL_CORE +endif +ifeq ($(TARGET), X64_AMD_BULLDOZER) +CFLAGS += -m64 -mavx -mfma 
-DTARGET_X64_AMD_BULLDOZER +endif +ifeq ($(TARGET), X86_AMD_JAGUAR) +CFLAGS += -m32 -mavx -DTARGET_X86_AMD_JAGUAR +ASFLAGS += -m32 -mavx -DTARGET_X86_AMD_JAGUAR +endif +ifeq ($(TARGET), X86_AMD_BARCELONA) +CFLAGS += -m32 -msse3 -DTARGET_X86_AMD_BARCELONA +ASFLAGS += -m32 -msse3 -DTARGET_X86_AMD_BARCELONA +endif +ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A57) +CFLAGS += -march=armv8-a+crc+crypto+simd -DTARGET_ARMV8A_ARM_CORTEX_A57 +ASFLAGS += -DTARGET_ARMV8A_ARM_CORTEX_A57 +endif +ifeq ($(TARGET), ARMV8A_ARM_CORTEX_A53) +CFLAGS += -march=armv8-a+crc+crypto+simd -DTARGET_ARMV8A_ARM_CORTEX_A53 +ASFLAGS += -DTARGET_ARMV8A_ARM_CORTEX_A53 +endif +ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A15) +CFLAGS += -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -mcpu=cortex-a15 -DTARGET_ARMV7A_ARM_CORTEX_A15 +ASFLAGS += -mfpu=neon-vfpv4 -DTARGET_ARMV7A_ARM_CORTEX_A15 +endif +ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A9) +CFLAGS += -marm -mfloat-abi=hard -mfpu=neon -mcpu=cortex-a9 -DTARGET_ARMV7A_ARM_CORTEX_A9 +ASFLAGS += -mfpu=neon -DTARGET_ARMV7A_ARM_CORTEX_A9 +endif +ifeq ($(TARGET), ARMV7A_ARM_CORTEX_A7) +CFLAGS += -marm -mfloat-abi=hard -mfpu=neon-vfpv4 -mcpu=cortex-a7 -DTARGET_ARMV7A_ARM_CORTEX_A7 +ASFLAGS += -mfpu=neon-vfpv4 -DTARGET_ARMV7A_ARM_CORTEX_A7 +endif +ifeq ($(TARGET), GENERIC) +CFLAGS += -DTARGET_GENERIC +endif +# TODO remove and fix tests +# CFLAGS += -DBLASFEO_TARGET=$(TARGET) diff --git a/Dockerfile b/docker/x86_64/Dockerfile similarity index 98% rename from Dockerfile rename to docker/x86_64/Dockerfile index 6c1b7204..6b21c225 100644 --- a/Dockerfile +++ b/docker/x86_64/Dockerfile @@ -29,7 +29,7 @@ RUN mkdir -p eigen/build && cd eigen/build && cmake -DCMAKE_INSTALL_PREFIX=/usr/ RUN apt-get install -y bc RUN git clone https://github.com/giaf/blasfeo.git RUN cd blasfeo && git checkout cc90e146ee9089de518f57dbb736e064bd82394e -COPY docker/blasfeo/Makefile.rule blasfeo +COPY docker/x86_64/blasfeo/Makefile.rule blasfeo RUN cd blasfeo && make -j `nproc` static_library && make install_static # 
Install libxsmm diff --git a/docker/blasfeo/Makefile.rule b/docker/x86_64/blasfeo/Makefile.rule similarity index 100% rename from docker/blasfeo/Makefile.rule rename to docker/x86_64/blasfeo/Makefile.rule diff --git a/include/blast/math/algorithm/Tile.hpp b/include/blast/math/algorithm/Tile.hpp index 7f6ecc63..3b4efef6 100644 --- a/include/blast/math/algorithm/Tile.hpp +++ b/include/blast/math/algorithm/Tile.hpp @@ -8,6 +8,10 @@ # include #endif +#if XSIMD_WITH_NEON64 +# include +#endif + #include #include diff --git a/include/blast/math/algorithm/arch/avx2/Tile.hpp b/include/blast/math/algorithm/arch/avx2/Tile.hpp index aac90b8a..f7decfde 100644 --- a/include/blast/math/algorithm/arch/avx2/Tile.hpp +++ b/include/blast/math/algorithm/arch/avx2/Tile.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -46,7 +47,7 @@ namespace blast :: detail BLAST_ALWAYS_INLINE void tile(xsimd::avx2 const& arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) { size_t constexpr SS = SimdSize_v; - size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be ppoperly determined + size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be properly determined static_assert(SO == columnMajor, "tile() for row-major matrices not implemented"); diff --git a/include/blast/math/algorithm/arch/neon64/Tile.hpp b/include/blast/math/algorithm/arch/neon64/Tile.hpp new file mode 100644 index 00000000..652a7917 --- /dev/null +++ b/include/blast/math/algorithm/arch/neon64/Tile.hpp @@ -0,0 +1,141 @@ +// Copyright 2024 Mikhail Katliar. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +#include +#include +#include +#include +#include + +#include + +#include + + +namespace blast :: detail +{ + template + BLAST_ALWAYS_INLINE void tile_backend(xsimd::neon64, size_t m, size_t n, size_t i, FF&& f_full, FP&& f_partial) + { + RegisterMatrix ker; + + if (i + KM <= m) + { + size_t j = 0; + + for (; j + KN <= n; j += KN) + f_full(ker, i, j); + + if (j < n) + f_partial(ker, i, j, KM, n - j); + } + else + { + size_t j = 0; + + for (; j + KN <= n; j += KN) + f_partial(ker, i, j, m - i, KN); + + if (j < n) + f_partial(ker, i, j, m - i, n - j); + } + } + + + template + BLAST_ALWAYS_INLINE void tile(xsimd::neon64 const& arch, StorageOrder traversal_order, std::size_t m, std::size_t n, FF&& f_full, FP&& f_partial) + { + size_t constexpr SS = SimdSize_v; + size_t constexpr TILE_STEP = 4; // TODO: this is almost arbitrary and needs to be properly determined + + static_assert(SO == columnMajor, "tile() for row-major matrices not implemented"); + + if (traversal_order == columnMajor) + { + size_t j = 0; + + // Main part + for (; j + TILE_STEP <= n; j += TILE_STEP) + { + size_t i = 0; + + // The i + 4 * SS != m condition improves performance when exactly 4 * SS rows remain: + // it is more efficient to apply the 2 * SS kernel twice than the 3 * SS kernel followed by the 1 * SS kernel.
+ for (; i + 3 * SS <= m && i + 4 * SS != m; i += 3 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + for (; i + 2 * SS <= m; i += 2 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + for (; i + 1 * SS <= m; i += 1 * SS) + { + RegisterMatrix ker; + f_full(ker, i, j); + } + + // Bottom side + if (i < m) + { + RegisterMatrix ker; + f_partial(ker, i, j, m - i, ker.columns()); + } + } + + + // Right side + if (j < n) + { + size_t i = 0; + + // The i + 4 * SS != m condition improves performance when exactly 4 * SS rows remain: + // it is more efficient to apply the 2 * SS kernel twice than the 3 * SS kernel followed by the 1 * SS kernel. + for (; i + 3 * SS <= m && i + 4 * SS != m; i += 3 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + for (; i + 2 * SS <= m; i += 2 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + for (; i + 1 * SS <= m; i += 1 * SS) + { + RegisterMatrix ker; + f_partial(ker, i, j, ker.rows(), n - j); + } + + // Bottom-right corner + if (i < m) + { + RegisterMatrix ker; + f_partial(ker, i, j, m - i, n - j); + } + } + } + else + { + size_t i = 0; + + // The i + 4 * SS != m condition improves performance when exactly 4 * SS rows remain: + // it is more efficient to apply the 2 * SS kernel twice than the 3 * SS kernel followed by the 1 * SS kernel.
+ for (; i + 2 * SS < m && i + 4 * SS != m; i += 3 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + + for (; i + 1 * SS < m; i += 2 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + + for (; i + 0 * SS < m; i += 1 * SS) + tile_backend(arch, m, n, i, f_full, f_partial); + } + } +} diff --git a/include/blast/math/dense/Iamax.hpp b/include/blast/math/dense/Iamax.hpp index 16e2e464..1c5551e3 100644 --- a/include/blast/math/dense/Iamax.hpp +++ b/include/blast/math/dense/Iamax.hpp @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -168,13 +167,13 @@ namespace blast * * @return index of the first element in @a x having maximum absolute value. */ - template - inline size_t iamax(DenseVector const& x) + template + inline size_t iamax(VT const& x) { size_t const N = size(x); if (N == 0) BLAST_THROW_EXCEPTION(std::invalid_argument {"Vector is empty"}); - return iamax(N, ptr(*x)); + return iamax(N, ptr(x)); } } diff --git a/include/blast/math/dense/Trmm.hpp b/include/blast/math/dense/Trmm.hpp index 34dc7c4f..c82bd674 100644 --- a/include/blast/math/dense/Trmm.hpp +++ b/include/blast/math/dense/Trmm.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include diff --git a/include/blast/math/panel/PanelSize.hpp b/include/blast/math/panel/PanelSize.hpp index 6bcf6bbf..9f9b500e 100644 --- a/include/blast/math/panel/PanelSize.hpp +++ b/include/blast/math/panel/PanelSize.hpp @@ -14,6 +14,13 @@ namespace blast { + /** + * @brief Default size of a panel (in a panel matrix) for a given architecture and data type + * + * TODO: Is it always equal to SIMD size? Deprecate?
+ * + * @tparam Arch architecture + */ template size_t constexpr PanelSize_v = SimdSize_v; } diff --git a/include/blast/math/panel/Potrf.hpp b/include/blast/math/panel/Potrf.hpp index e1763809..2cd26740 100644 --- a/include/blast/math/panel/Potrf.hpp +++ b/include/blast/math/panel/Potrf.hpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include @@ -61,7 +61,7 @@ namespace blast PanelMatrix const& A, PanelMatrix& L) { using ET = ElementType_t; - size_t constexpr PANEL_SIZE = PanelSize_v; + size_t constexpr SS = SimdSize_v; BLAZE_CONSTRAINT_MUST_BE_SAME_TYPE(ElementType_t, ET); @@ -77,7 +77,14 @@ namespace blast if (columns(L) != N) BLAZE_THROW_INVALID_ARGUMENT("Invalid matrix size"); - size_t constexpr KN = 4; + // Calculate Maximum number of columns of a register matrix that can be used in ger() without spilling registers. + // NOTE: RegisterMatrix.potrf() has the limitation that it works only with matrices whose number of columns + // is not less than the number of rows. This limits the max number of columns by the number of rows + // of the smallest used RegisterMatrix, which is 1 * SS. 
+ size_t constexpr RC = registerCapacity(xsimd::default_arch {}); + size_t constexpr MAX_RM = 3; // first dimension of the largest used RegisterMatrix, in SIMD registers + static_assert(RC >= MAX_RM + 1); + size_t constexpr KN = std::min((RC - (MAX_RM + 1)) / MAX_RM, SS); size_t k = 0; // This loop unroll gives some performance benefit for N >= 18, @@ -87,14 +94,14 @@ namespace blast { size_t i = k; - for (; i + 2 * PANEL_SIZE < M; i += 3 * PANEL_SIZE) - potrf_backend<3 * PANEL_SIZE, KN>(k, i, *A, *L); + for (; i + 2 * SS < M; i += 3 * SS) + potrf_backend<3 * SS, KN>(k, i, *A, *L); - for (; i + 1 * PANEL_SIZE < M; i += 2 * PANEL_SIZE) - potrf_backend<2 * PANEL_SIZE, KN>(k, i, *A, *L); + for (; i + 1 * SS < M; i += 2 * SS) + potrf_backend<2 * SS, KN>(k, i, *A, *L); - for (; i + 0 * PANEL_SIZE < M; i += 1 * PANEL_SIZE) - potrf_backend<1 * PANEL_SIZE, KN>(k, i, *A, *L); + for (; i + 0 * SS < M; i += 1 * SS) + potrf_backend<1 * SS, KN>(k, i, *A, *L); } } } diff --git a/include/blast/math/panel/StaticPanelMatrix.hpp b/include/blast/math/panel/StaticPanelMatrix.hpp index 47e7982c..ef29dfac 100644 --- a/include/blast/math/panel/StaticPanelMatrix.hpp +++ b/include/blast/math/panel/StaticPanelMatrix.hpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -183,9 +182,6 @@ namespace blast ? i / panelSize_ * spacing_ + i % panelSize_ + j * panelSize_ : j / panelSize_ * spacing_ + j % panelSize_ + i * panelSize_; } - - - BLAZE_CONSTRAINT_MUST_BE_VECTORIZABLE_TYPE(Type); }; diff --git a/include/blast/math/simd/RegisterCapacity.hpp b/include/blast/math/simd/RegisterCapacity.hpp index 35b2ac7e..0f48789d 100644 --- a/include/blast/math/simd/RegisterCapacity.hpp +++ b/include/blast/math/simd/RegisterCapacity.hpp @@ -21,12 +21,13 @@ namespace blast { /** - * @brief Number of available SIMD registers. + * @brief Number of available SIMD registers for a given architecture. 
* * @return Number of SIMD registers for AVX2 */ - std::size_t constexpr registerCapacity(xsimd::avx2) + template + std::size_t constexpr registerCapacity(Arch arch) { - return 16; + return detail::registerCapacity(arch); } } diff --git a/include/blast/math/simd/Simd.hpp b/include/blast/math/simd/Simd.hpp index b5bd8873..4106ae31 100644 --- a/include/blast/math/simd/Simd.hpp +++ b/include/blast/math/simd/Simd.hpp @@ -18,3 +18,7 @@ #if XSIMD_WITH_AVX2 #include #endif + +#if XSIMD_WITH_NEON64 + #include +#endif diff --git a/include/blast/math/simd/SimdIndex.hpp b/include/blast/math/simd/SimdIndex.hpp index 3d7ca7b2..9ca8c8fd 100644 --- a/include/blast/math/simd/SimdIndex.hpp +++ b/include/blast/math/simd/SimdIndex.hpp @@ -58,18 +58,25 @@ namespace blast using Type = std::uint64_t; }; + template + requires (xsimd::batch::size == 2) && std::is_integral_v + constexpr xsimd::batch integerSequence() + { + return {0, 1}; + } + template requires (xsimd::batch::size == 4) && std::is_integral_v - inline xsimd::batch indexSequence(T start) noexcept + constexpr xsimd::batch integerSequence() { - return {start, start + 1, start + 2, start + 3}; + return {0, 1, 2, 3}; } template requires (xsimd::batch::size == 8) && std::is_integral_v - inline xsimd::batch indexSequence(T start) noexcept + constexpr xsimd::batch integerSequence() { - return {start, start + 1, start + 2, start + 3, start + 4, start + 5, start + 6, start + 7}; + return {0, 1, 2, 3, 4, 5, 6, 7}; } } @@ -85,14 +92,11 @@ namespace blast /// @brief Construct an integer index sequence /// - /// @param start start of the sequence - /// - /// @return [ @a start, @a start + 1, ..., @a start + N - 1 ] - /// where N = SimdIndex::size + /// @return [0, 1, ..., SimdIndex::size - 1] /// template - inline SimdIndex indexSequence(typename SimdIndex::value_type start = 0) noexcept + constexpr SimdIndex indexSequence() { - return detail::indexSequence, Arch>(start); + return detail::integerSequence, Arch>(); } } diff --git 
a/include/blast/math/simd/arch/Avx2.hpp b/include/blast/math/simd/arch/Avx2.hpp index b5aed8c1..560e9a62 100644 --- a/include/blast/math/simd/arch/Avx2.hpp +++ b/include/blast/math/simd/arch/Avx2.hpp @@ -20,6 +20,15 @@ namespace blast { + namespace detail + { + std::size_t constexpr registerCapacity(xsimd::avx2) + { + return 16; + } + } + + template requires std::is_base_of_v inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) noexcept diff --git a/include/blast/math/simd/arch/Neon64.hpp b/include/blast/math/simd/arch/Neon64.hpp new file mode 100644 index 00000000..35b788f4 --- /dev/null +++ b/include/blast/math/simd/arch/Neon64.hpp @@ -0,0 +1,78 @@ +// Copyright 2024 Mikhail Katliar +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include + +#include + + +namespace blast +{ + namespace detail + { + std::size_t constexpr registerCapacity(xsimd::neon64) /* SIMD register count reported for the neon64 arch tag */ + { + return 32; + } + } + + + template + requires std::is_base_of_v + inline xsimd::batch maskload(float const * src, xsimd::batch_bool const& mask) /* not noexcept: this stub throws std::logic_error */ + { + throw std::logic_error {"Not implemented"}; + } + + + template + requires std::is_base_of_v + inline xsimd::batch maskload(double const * src, xsimd::batch_bool const& mask) /* not noexcept: this stub throws std::logic_error */ + { + throw std::logic_error {"Not implemented"}; + } + + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, float * dst, xsimd::batch_bool const& mask) /* not noexcept: this stub throws std::logic_error */ + { + throw std::logic_error {"Not implemented"}; + } + + + template + requires std::is_base_of_v + inline void maskstore(xsimd::batch const& v, double * dst, xsimd::batch_bool const& mask) /* not noexcept: this stub throws std::logic_error */ + { + throw std::logic_error {"Not implemented"}; + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& v1, xsimd::batch const& idx) /* not noexcept: this stub throws std::logic_error */ + { + throw std::logic_error {"Not implemented"}; + } + + + template + requires std::is_base_of_v + inline std::tuple, xsimd::batch> imax(xsimd::batch const& x, xsimd::batch const& idx) /* not noexcept: this stub throws std::logic_error */ + { + throw std::logic_error {"Not implemented"}; + } +} diff --git a/include/blast/system/Tile.hpp b/include/blast/system/Tile.hpp index dd471a6f..7820eb13 100644 --- a/include/blast/system/Tile.hpp +++ b/include/blast/system/Tile.hpp @@ -4,18 +4,14 @@ #pragma once -//************************************************************************************************* -// Includes -//************************************************************************************************* - -#include +#include namespace blast { - using namespace blaze; - - + /** + * @brief TODO: deprecate? + */ template struct TileSize;