From 79d7b332e56427964ded0126fd97e7619348c6a6 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Tue, 8 Feb 2022 23:08:58 +0000
Subject: [PATCH 01/67] ARM Backend for marian

Provides an ARM backend for marian: matrix multiplies are done using
google/ruy and math functions come through SIMDe
(https://simd-everywhere.github.io/blog/about/), effectively getting
marian-decoder to run on ARM.

The following cmake flags are added:

- USE_INTGEMM (to switch intgemm on/off)
- USE_RUY (to switch ruy on/off)
- USE_ONNX_SGEMM (use the ONNX SGEMM added for WASM to provide the
  attention matrix multiply, which currently relies on a BLAS library).
- USE_SIMDE (swaps out the existing Intel-intrinsic based functions for
  SIMDe equivalents).

The built marian-decoder is tested on an Oracle Cloud ARM machine with the
following specs:

Architecture   : aarch64
CPU op-mode(s) : 32-bit, 64-bit
Byte Order     : Little Endian
Vendor ID      : ARM
Model name     : Neoverse-N1
Flags          : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp
                 asimdhp cpuid asimdrdm lrcpc dcpop asimddp ssbs

A CI check on GitHub Actions is added, using the Android NDK to
cross-compile targeting arm64-v8a. The built binary is tested to work on an
Android phone (Samsung M30s) using Termux.

A successful Android build additionally requires a patch
(sentencepiece -> protobuf). See
https://github.com/opencv/opencv/issues/17282 and
https://github.com/opencv/opencv/pull/19049.

-Werror etc. cause issues with ruy (-Wmulti-line-comment) and are disabled.

The following minor changes are also applied:

- Remove M32_BINARIES; use COMPILE_WASM for -m32.
- Hide -msse4.1 on unknown platforms.
- faiss was previously hardcoded for platforms with SSE available. This has
  been mitigated by adding a reference standard C++ implementation of the
  missing function.
- Exclude packed_gemm_....cpp from sources if USE_FBGEMM=off.
- MSVC workaround following
  https://github.com/browsermt/marian-dev/pull/56#issuecomment-945821693
---
 .github/workflows/arm.yml                     | 147 +++++
 .github/workflows/macos.yml                   |   2 +-
 .../native-customized_marian-macos.yml        |   2 +-
 .github/workflows/ubuntu.yml                  |   2 +-
 .../wasm-customized_marian-macos.yml          |   2 +-
 .../wasm-customized_marian-ubuntu.yml         |   2 +-
 .github/workflows/windows.yml                 |   2 +-
 .gitmodules                                   |   6 +
 CMakeLists.txt                                |  68 ++-
 patches/01-spm-protobuf-android.patch         |  14 +
 scripts/run.sh                                |  40 ++
 src/3rd_party/CMakeLists.txt                  |  28 +-
 src/3rd_party/faiss/VectorTransform.cpp       |  13 +-
 src/3rd_party/ruy                             |   1 +
 src/3rd_party/sentencepiece                   |   2 +-
 src/3rd_party/simde-no-tests                  |   1 +
 src/3rd_party/sse_mathfun.h                   |   9 +
 src/CMakeLists.txt                            |  15 +-
 src/common/config_parser.cpp                  |  10 -
 src/common/logging.cpp                        |   6 +-
 src/common/types.h                            |   6 +
 src/layers/lsh.cpp                            |   4 +-
 src/tensors/cpu/expression_graph_packable.h   |   8 +-
 src/tensors/cpu/integer_common.cpp            |  15 +-
 src/tensors/cpu/integer_common.h              |  34 +-
 src/tensors/cpu/intgemm_interface.h           |  62 +-
 src/tensors/cpu/prod.cpp                      |  13 +-
 src/tensors/cpu/prod_blas.h                   |  15 +-
 src/tensors/cpu/ruy_adapter.h                 | 534 ++++++++++++++++++
 29 files changed, 974 insertions(+), 89 deletions(-)
 create mode 100644 .github/workflows/arm.yml
 create mode 100644 patches/01-spm-protobuf-android.patch
 create mode 100644 scripts/run.sh
 create mode 160000 src/3rd_party/ruy
 create mode 160000 src/3rd_party/simde-no-tests
 create mode 100644 src/tensors/cpu/ruy_adapter.h

diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml
new file mode 100644
index 000000000..4cab55c9e
--- /dev/null
+++ b/.github/workflows/arm.yml
@@ -0,0 +1,147 @@
+name: ARM
+'on':
+  push:
+    branches:
+      - main
+      - ci-sandbox
+  pull_request:
+    branches:
+      - '**'
+env:
+  ccache_basedir: ${{ github.workspace }}
+  ccache_dir: "${{ github.workspace }}/.ccache"
+  ccache_compilercheck: content
+  ccache_compress: 'true'
+  ccache_compresslevel: 9
+  ccache_maxsize: 200M
+  ccache_cmake: -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache
+  ndk: "${{ github.workspace }}/android-ndk-r23b"
+  abi: "arm64-v8a"
+  minsdk_version: 28
+  android_platform: 28
+
+jobs:
+  ubuntu:
+    name: "arm-v8a cross-compile via Android NDK"
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        with:
+          submodules: recursive
+
+      - name: Install prerequisites
+        run: |
+          wget -c --quiet https://dl.google.com/android/repository/android-ndk-r23b-linux.zip
+          unzip -qq android-ndk-r23b-linux.zip
+          sudo apt-get -y install ccache cmake
+
+      - name: Generate ccache_vars for ccache based on machine
+        shell: bash
+        id: ccache_vars
+        run: |-
+          echo "::set-output name=hash::$(echo ${{ env.ccache_compilercheck }})"
+          echo "::set-output name=timestamp::$(date '+%Y-%m-%dT%H.%M.%S')"
+
+      - name: Cache-op for build-cache through ccache
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.ccache_dir }}
+          key: ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-${{ steps.ccache_vars.outputs.timestamp }}
+          restore-keys: |-
+            ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}
+            ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}
+            ccache-${{ matrix.identifier }}
+
+      - name: ccache environment setup
+        run: |-
+          echo "CCACHE_COMPILER_CHECK=${{ env.ccache_compilercheck }}" >> $GITHUB_ENV
+          echo "CCACHE_BASEDIR=${{ env.ccache_basedir }}" >> $GITHUB_ENV
+          echo "CCACHE_COMPRESS=${{ env.ccache_compress }}" >> $GITHUB_ENV
+          echo "CCACHE_COMPRESSLEVEL=${{ env.ccache_compresslevel }}" >> $GITHUB_ENV
+          echo "CCACHE_DIR=${{ env.ccache_dir }}" >> $GITHUB_ENV
+          echo "CCACHE_MAXSIZE=${{ env.ccache_maxsize }}" >> $GITHUB_ENV
+
+      - name: ccache prolog
+        run: |-
+          ccache -s # Print current cache stats
+          ccache -z # Zero cache entry
+
+      - name: Apply patch sentencepiece for android
+        run: |-
+          patch -p1 < patches/01-spm-protobuf-android.patch
+
+      - name: Generate buildfiles for marian on android via cmake
+        run: |-
+          mkdir -p build
+          cd build
+          NDK=${{ env.ndk }}
+          ABI=${{ env.abi }}
+          MINSDK_VERSION=${{ env.minsdk_version }}
+          ANDROID_PLATFORM=${{ env.android_platform }}
+          OTHER_ANDROID_ARGS=(
+            -DANDROID_ARM_NEON=TRUE
+          )
+          OTHER_MARIAN_ARGS=(
+            -DCOMPILE_CUDA=off
+            -DCOMPILE_CPU=on
+            -DCMAKE_HAVE_THREADS_LIBRARY=1
+            -DCMAKE_USE_WIN32_THREADS_INIT=0
+            -DCMAKE_USE_PTHREADS_INIT=1
+            -DTHREADS_PREFER_PTHREAD_FLAG=ON
+            -DBUILD_ARCH=armv8-a
+            -DUSE_INTGEMM=off
+            -DUSE_SIMDE=on
+            -DUSE_RUY=on
+            -DUSE_ONNX_SGEMM=on # For the time being.
+            # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see.
+          )
+          # -L additionally lists the finally-configured variables.
+          cmake -L \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \
+            -DANDROID_TOOLCHAIN=clang \
+            -DANDROID_ABI=$ABI \
+            -DANDROID_PLATFORM=$ANDROID_PLATFORM \
+            -DANDROID_NATIVE_API_LEVEL=$MINSDK_VERSION \
+            -DANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.8 \
+            -DANDROID_STL=c++_static \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            "${OTHER_ANDROID_ARGS[@]}" "${OTHER_MARIAN_ARGS[@]}" \
+            ..
+
+
+      - name: Build marian for android
+        working-directory: build
+        run: |-
+          # Only build marian (lib) for now.
+ make -j2 + + - name: ccache epilog + run: 'ccache -s # Print current cache stats' + + - uses: actions/upload-artifact@v2 + with: + path: ${{github.workspace}}/build/marian-decoder + + + # Disable release for now. + # release: + # name: Release Latest Build + # runs-on: ubuntu-latest + # needs: [ubuntu] + # if: github.ref == 'refs/heads/master' + # steps: + # - name: Download artifacts + # uses: actions/download-artifact@v2 + # + # - name: Update GitHub prerelease + # uses: marvinpinto/action-automatic-releases@latest + # with: + # repo_token: ${{ secrets.GITHUB_TOKEN }} + # automatic_release_tag: latest + # prerelease: true + # title: "Latest Build" + # files: | + # artifact/marian-decoder diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 6d18dea84..b71ddad54 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-macos: diff --git a/.github/workflows/native-customized_marian-macos.yml b/.github/workflows/native-customized_marian-macos.yml index 41f102a0d..6c3405a01 100644 --- a/.github/workflows/native-customized_marian-macos.yml +++ b/.github/workflows/native-customized_marian-macos.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-macos: diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index a01770e17..f4a1c32c8 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-ubuntu: diff --git a/.github/workflows/wasm-customized_marian-macos.yml b/.github/workflows/wasm-customized_marian-macos.yml index cfafe0182..5b797499b 100644 --- a/.github/workflows/wasm-customized_marian-macos.yml +++ b/.github/workflows/wasm-customized_marian-macos.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-wasm: diff --git a/.github/workflows/wasm-customized_marian-ubuntu.yml b/.github/workflows/wasm-customized_marian-ubuntu.yml index 8294665f6..4777d04de 100644 --- a/.github/workflows/wasm-customized_marian-ubuntu.yml +++ b/.github/workflows/wasm-customized_marian-ubuntu.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-wasm: diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 648887aec..06aa22623 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] env: MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" diff --git a/.gitmodules b/.gitmodules index a8facd1fd..ae76037f1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -23,3 +23,9 @@ [submodule "src/3rd_party/onnxjs"] path = src/3rd_party/onnxjs url = https://github.com/abhi-agg/onnxjs.git +[submodule "src/3rd_party/simde-no-tests"] + path = src/3rd_party/simde-no-tests + url = https://github.com/simd-everywhere/simde-no-tests/ +[submodule "src/3rd_party/ruy"] + path = src/3rd_party/ruy + url = https://github.com/google/ruy diff --git a/CMakeLists.txt b/CMakeLists.txt index 12b38f0f5..97ef84269 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,29 +31,31 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) 
 option(USE_CUDNN "Use CUDNN library" OFF)
 option(USE_DOXYGEN "Build documentation with Doxygen" ON)
 option(USE_FBGEMM "Use FBGEMM" OFF)
+option(USE_INTGEMM "Use INTGEMM" ON)
 option(USE_MKL "Compile with MKL support" ON)
 option(USE_MPI "Use MPI library" OFF)
 option(USE_NCCL "Use NCCL library" ON)
 option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
 option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
 option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
-option(M32_BINARIES "Generate 32bit binaries even when building outside of WASM. Useful for testing some WASM specific functionality without the need for the compiling to WASM." OFF)
 option(COMPILE_WASM "Compile (wasm compatible) marian for WASM target" OFF)
 option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compile to wasm. Useful for debugging wasm failures by building same sources natively" OFF)
+option(USE_SIMDE "Use SIMDe to map the Intel-intrinsic based code onto the target instruction set" OFF)
+option(USE_ONNX_SGEMM "Use the wasm-compatible ONNX SGEMM for matrix multiply" OFF)
+option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF)
+
 # cmake options that are dependent on USE_WASM_COMPATIBLE_SOURCE cmake option
 CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF
                        "USE_WASM_COMPATIBLE_SOURCE" ON)
-CMAKE_DEPENDENT_OPTION(USE_WASM_COMPATIBLE_BLAS "Compile with wasm compatible blas" ON
-                       "USE_WASM_COMPATIBLE_SOURCE" OFF)
-CMAKE_DEPENDENT_OPTION(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" ON
-                       "USE_WASM_COMPATIBLE_SOURCE" OFF)

 if (USE_WASM_COMPATIBLE_SOURCE)
   set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)")
   add_compile_definitions(WASM_COMPATIBLE_SOURCE)
   # Setting USE_SSE2 definition to enable SSE2 specific code in "3rd_party/sse_mathfun.h" for wasm builds
   add_compile_definitions(USE_SSE2)
+  set(USE_ONNX_SGEMM ON CACHE BOOL "")
+  set(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "")
 endif()

 if (COMPILE_WASM)
@@ -61,10 +63,11 @@ if (COMPILE_WASM)
   set(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160")
 endif()

-if(M32_BINARIES OR COMPILE_WASM)
+
+if(COMPILE_WASM)
   set("BUILD_WIDTH" "-m32")
-else(M32_BINARIES OR COMPILE_WASM)
-  set("BUILD_WIDTH" "-m64")
+else(COMPILE_WASM)
+  set("BUILD_WIDTH" "")
 endif()

 if(NOT COMPILE_WASM)
@@ -167,6 +170,7 @@ if(MSVC)
   # set(INTRINSICS "/arch:AVX")
   add_definitions(-DUSE_SSE2=1)
+  # Use the architecture MSVC is building for instead:
   set(INTRINSICS ${MSVC_BUILD_ARCH})
   # set(INTRINSICS "/arch:AVX512")
@@ -193,6 +197,16 @@ if(MSVC)
     set(EXT_LIBS ${EXT_LIBS} fbgemm)
     add_definitions(-DUSE_FBGEMM=1 -DFBGEMM_STATIC=1)
   endif(USE_FBGEMM)
+
+  if(USE_INTGEMM)
+    add_definitions(-DUSE_INTGEMM=1)
+  endif(USE_INTGEMM)
+
+  if(USE_SIMDE)
+    add_definitions(-DUSE_SIMDE=1)
+    add_definitions(-DSIMDE_ENABLE_NATIVE_ALIASES=1)
+  endif(USE_SIMDE)
+
 else(MSVC)

   # Check we are using at least g++ 5.0
@@ -249,7 +263,7 @@ else(MSVC)
     # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case)
     set(INTRINSICS "-mssse3 -msimd128")
   else()
-    set(INTRINSICS "-msse4.1")
+    # Not assuming we have "-msse4.1" here.
   endif()

   if(USE_FBGEMM)
     set(EXT_LIBS ${EXT_LIBS} fbgemm)
     add_definitions(-DUSE_FBGEMM=1)
   endif(USE_FBGEMM)

+  if(USE_INTGEMM)
+    add_definitions(-DUSE_INTGEMM=1)
+  endif(USE_INTGEMM)
+
+  if(USE_SIMDE)
+    add_definitions(-DUSE_SIMDE=1)
+    add_definitions(-DSIMDE_ENABLE_NATIVE_ALIASES=1)
+  endif(USE_SIMDE)
+
   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
     # Clang-10.0.0 complains when CUDA is newer than 10.1
     set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-warning-option -Wno-unknown-cuda-version")
@@ -266,7 +289,8 @@ else(MSVC)
   set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${CLANG_IGNORE_UNUSED_VALUES}") # This needs to appear here as well to appease clang11+ on linux

   # These are used in src/CMakeLists.txt on a per-target basis
-  list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated;
+  list(APPEND ALL_WARNINGS -Wall; # -Werror;
+    -Wextra; -Wno-unused-result; -Wno-deprecated;
     -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function;
     -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare;
     -Wno-missing-field-initializers; ${CLANG_IGNORE_UNUSED_PRIVATE_FIELD})
@@ -542,24 +566,32 @@ endif(USE_MPI)
 ###############################################################################
 # Find BLAS library for CPU compilation
 if(COMPILE_CPU)
-  set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant
+  if(USE_INTGEMM)
+    set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant
+  endif(USE_INTGEMM)
+
+  if(USE_RUY)
+    set(EXT_LIBS ${EXT_LIBS} ruy)
+  endif(USE_RUY)
+
   add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU
   # in case a BLAS vendor is not found, we have a runtime error, although we should probably not allow the compilation to go on
   # if there are BLAS vendors, we have other runtime checks with sane error messages.
-  if(USE_WASM_COMPATIBLE_BLAS)
+  if(USE_ONNX_SGEMM) ## Use a wasm compatible BLAS
+    ## ^ Strictly speaking, SGEMM != BLAS: only the matrix multiply is provided.
     set(EXT_LIBS ${EXT_LIBS} onnx-sgemm)
-    set(BLAS_FOUND TRUE)
-    set(BLAS_VENDOR "ONNX-SGEMM")
-    add_definitions(-DBLAS_FOUND=1 -DWASM_COMPATIBLE_BLAS=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock
-  elseif(APPLE AND USE_APPLE_ACCELERATE)
+    add_definitions(-DUSE_ONNX_SGEMM=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock
+  endif(USE_ONNX_SGEMM)
+
+  if(APPLE AND USE_APPLE_ACCELERATE)
     set(BLAS_VENDOR "Accelerate")
     # see https://developer.apple.com/documentation/accelerate for more info
    # you may need to install Xcode command line tools if you don't have them already (https://developer.apple.com/xcode/features/)
     include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers")
     set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate")
     add_definitions(-DBLAS_FOUND=1)
-  else(USE_WASM_COMPATIBLE_BLAS)
+  else(USE_ONNX_SGEMM)
     if(USE_MKL)
       find_package(MKL)
     endif(USE_MKL)
@@ -580,7 +612,7 @@ if(COMPILE_CPU)
       endif(CBLAS_FOUND)
     endif(BLAS_FOUND)
   endif(MKL_FOUND)
-  endif(USE_WASM_COMPATIBLE_BLAS)
+  endif(USE_ONNX_SGEMM)
 endif(COMPILE_CPU)

 ###############################################################################
diff --git a/patches/01-spm-protobuf-android.patch b/patches/01-spm-protobuf-android.patch
new file mode 100644
index 000000000..6eadb5454
--- /dev/null
+++ b/patches/01-spm-protobuf-android.patch
@@ -0,0 +1,14 @@
+diff --git a/src/3rd_party/sentencepiece/src/CMakeLists.txt b/src/3rd_party/sentencepiece/src/CMakeLists.txt
+index 0ba6407..276caec 100644
+--- a/ src/3rd_party/sentencepiece/src/CMakeLists.txt
++++ b/src/3rd_party/sentencepiece/src/CMakeLists.txt
+@@ -159,6 +159,9 @@ set(SPM_TEST_SRCS
+ find_package(Threads REQUIRED)
+ 
+ set(SPM_LIBS ${PROTOBUF_LITE_LIBRARY} Threads::Threads)
++if(ANDROID)
++  set(SPM_LIBS ${SPM_LIBS} android log)
++endif(ANDROID)
+ 
+ if (SPM_ENABLE_NFKC_COMPILE)
+   find_package(ICU 4.4 COMPONENTS i18n data uc REQUIRED)
diff --git a/scripts/run.sh b/scripts/run.sh
new file mode 100644
index 000000000..f080dee68
--- /dev/null
+++ b/scripts/run.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+NDK=/mnt/Storage/jphilip/android-ndk-r23b
+ABI=arm64-v8a
+MINSDK_VERSION=28
+CUSTOM_MODULE_PATH=/mnt/Storage/jphilip/marian-android/openblas-install/lib/cmake/openblas
+ANDROID_PLATFORM=28
+
+OTHER_ANDROID_ARGS=(
+  -DANDROID_ARM_NEON=TRUE
+)
+
+OTHER_MARIAN_ARGS=(
+  -DCOMPILE_CUDA=off
+  -DCOMPILE_CPU=on
+  -DCMAKE_HAVE_THREADS_LIBRARY=1
+  -DCMAKE_USE_WIN32_THREADS_INIT=0
+  -DCMAKE_USE_PTHREADS_INIT=1
+  -DTHREADS_PREFER_PTHREAD_FLAG=ON
+  -DBUILD_ARCH=armv8-a
+  -DUSE_INTGEMM=off
+  -DUSE_SIMDE=on
+  -DUSE_RUY=on
+  -DUSE_ONNX_SGEMM=on # For the time being.
+  -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see.
+)
+# -L additionally lists the finally-configured variables.
+cmake -L \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \
+  -DCMAKE_MODULE_PATH=$CUSTOM_MODULE_PATH \
+  -DANDROID_TOOLCHAIN=clang \
+  -DANDROID_ABI=$ABI \
+  -DANDROID_PLATFORM=$ANDROID_PLATFORM \
+  -DANDROID_NATIVE_API_LEVEL=$MINSDK_VERSION \
+  -DANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.8 \
+  -DANDROID_STL=c++_static \
+  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+  "${OTHER_ANDROID_ARGS[@]}" "${OTHER_MARIAN_ARGS[@]}" \
+  ..
diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt
index f335c218a..2fbe69b14 100644
--- a/src/3rd_party/CMakeLists.txt
+++ b/src/3rd_party/CMakeLists.txt
@@ -5,17 +5,35 @@ add_subdirectory(./yaml-cpp)
 if(NOT USE_WASM_COMPATIBLE_SOURCE)
   add_subdirectory(./SQLiteCpp)
   add_subdirectory(./zlib)
+  add_subdirectory(./faiss)
   include_directories(./faiss)
 endif()

-add_subdirectory(./pathie-cpp)
-set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
-add_subdirectory(./intgemm)
+add_subdirectory(./pathie-cpp)

-if(USE_WASM_COMPATIBLE_BLAS)
+if(USE_INTGEMM)
+  set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
+  add_subdirectory(./intgemm)
+endif(USE_INTGEMM)
+
+if(USE_RUY)
+  set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE)
+  set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE)
+  set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE)
+  set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE)
+  set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL " " FORCE)
+  add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL)
+  add_subdirectory(ruy EXCLUDE_FROM_ALL)
+endif(USE_RUY)
+
+if(USE_SIMDE)
+  include_directories(./simde-no-tests)
+endif(USE_SIMDE)
+
+if(USE_ONNX_SGEMM)
   add_subdirectory(./onnxjs)
-endif(USE_WASM_COMPATIBLE_BLAS)
+endif(USE_ONNX_SGEMM)

 if(USE_FBGEMM)
   # @TODO: find out if this is somehow harmful.
  # This is suppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS
diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp
index 103b0910e..a26c2b4d3 100644
--- a/src/3rd_party/faiss/VectorTransform.cpp
+++ b/src/3rd_party/faiss/VectorTransform.cpp
@@ -132,7 +132,18 @@ const float *fvecs_maybe_subsample(
     return x_subset;
 }

-#if 1 // def __SSE__
+// Reference standard C++ implementation, used where SSE is unavailable.
+float fvec_norm_L2sqr_ref(const float *x, size_t d)
+{
+    size_t i;
+    double res = 0;
+    for (i = 0; i < d; i++)
+        res += x[i] * x[i];
+    return res;
+}
+
+
+
+#ifdef __SSE__

 // reads 0 <= d < 4 floats as __m128
 static inline __m128 masked_read(int d, const float *x)
 {
diff --git a/src/3rd_party/ruy b/src/3rd_party/ruy
new file mode 160000
index 000000000..2d950b3bf
--- /dev/null
+++ b/src/3rd_party/ruy
@@ -0,0 +1 @@
+Subproject commit 2d950b3bfa7ebfbe7a97ecb44b1cc4da5ac1d6f0
diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece
index 3ffdc0065..c307b874d 160000
--- a/src/3rd_party/sentencepiece
+++ b/src/3rd_party/sentencepiece
@@ -1 +1 @@
-Subproject commit 3ffdc0065a03cadd9d0e5e123aaf9b6ea7ffb05d
+Subproject commit c307b874deb5ea896db8f93506e173353e66d4d3
diff --git a/src/3rd_party/simde-no-tests b/src/3rd_party/simde-no-tests
new file mode 160000
index 000000000..9af03cd0f
--- /dev/null
+++ b/src/3rd_party/simde-no-tests
@@ -0,0 +1 @@
+Subproject commit 9af03cd0f30efae1beb94ef31430dc0370b98b0c
diff --git a/src/3rd_party/sse_mathfun.h b/src/3rd_party/sse_mathfun.h
index 91155cac3..89ca1d3ed 100644
--- a/src/3rd_party/sse_mathfun.h
+++ b/src/3rd_party/sse_mathfun.h
@@ -29,7 +29,13 @@
   (this is the zlib license)
 */

+#ifndef USE_SIMDE
 #include <xmmintrin.h>
+#else
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "x86/sse.h"
+#endif

 /* yes I know, the top of this file is quite ugly */

@@ -712,3 +718,6 @@ static inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
 }

+#ifdef USE_SIMDE
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9123e2324..5845a6710 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -7,6 +7,8 @@ include_directories(3rd_party/sentencepiece)
 include_directories(3rd_party/sentencepiece/third_party/protobuf-lite)
 include_directories(3rd_party/fbgemm/include)
 include_directories(3rd_party/intgemm)
+include_directories(3rd_party/ruy)
+include_directories(3rd_party/simde-no-tests)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party/intgemm)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party)
 include_directories(${CMAKE_BINARY_DIR}/local/include)
@@ -99,12 +101,16 @@ set(MARIAN_SOURCES
   $
 )

-if (NOT USE_WASM_COMPATIBLE_SOURCE)
+if (NOT USE_WASM_COMPATIBLE_SOURCE AND NOT ANDROID)
+  # ExceptionWithCallStack is unsupported on WASM, and unsupported on Android as well.
   list(APPEND MARIAN_SOURCES
     3rd_party/ExceptionWithCallStack.cpp
+  )
+endif()
+
+if (NOT USE_WASM_COMPATIBLE_SOURCE)
+  list(APPEND MARIAN_SOURCES
     data/corpus_sqlite.cpp
-    tensors/cpu/fbgemm/packed_gemm.cpp
     layers/lsh.cpp
     optimizers/quantizer.cpp
@@ -122,6 +128,11 @@ if (NOT USE_WASM_COMPATIBLE_SOURCE)
     $
     $
   )
+  if(USE_FBGEMM)
+    list(APPEND MARIAN_SOURCES
+      tensors/cpu/fbgemm/packed_gemm.cpp
+    )
+  endif(USE_FBGEMM)
 endif()

 add_library(marian STATIC ${MARIAN_SOURCES})
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 9e90bbb59..3bcc6518e 100755
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -15,16 +15,6 @@
 #include
 #include

-#if MKL_FOUND
-#include <mkl.h>
-#elif BLAS_FOUND
-  #if WASM_COMPATIBLE_BLAS
-    #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
-  #else
-    #include <cblas.h>
-  #endif // WASM_COMPATIBLE_BLAS
-#endif
-
 namespace marian {

 // TODO: Move this to CLIWrapper and allow to mark options as paths in the same place they are
diff --git a/src/common/logging.cpp b/src/common/logging.cpp
index 999b97b42..51d80510a 100644
--- a/src/common/logging.cpp
+++ b/src/common/logging.cpp
@@ -128,7 +128,7 @@ static void setErrorHandlers() {
   std::set_terminate(unhandledException);
 #ifdef __unix__
   // catch segfaults
-  struct sigaction sa = { {0} };
+  struct sigaction sa = { 0 };
   sigemptyset(&sa.sa_mask);
   sa.sa_flags = SA_SIGINFO;
   sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };
@@ -149,8 +149,8 @@ void switchtoMultinodeLogging(std::string nodeIdStr) {
 namespace marian {

 std::string noinline getCallStack(size_t skipLevels) {
-  #ifdef WASM_COMPATIBLE_SOURCE
-  return "Callstacks not supported in WASM builds currently";
+  #if defined(WASM_COMPATIBLE_SOURCE) || defined(__ANDROID__)
+  return "Callstacks not supported in WASM or Android builds currently";
   #else
   return ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true);
   #endif
diff --git a/src/common/types.h b/src/common/types.h
index 575e77120..fb6aab27a 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -17,7 +17,13 @@
 #include

 #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
+
+#ifndef USE_SIMDE
 #include <immintrin.h>
+#else
+#include "x86/avx2.h"
+#endif
+
 #endif

 #ifdef __CUDACC__ // nvcc is compiling this code
diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp
index a91778ed5..19b040ab9 100644
--- a/src/layers/lsh.cpp
+++ b/src/layers/lsh.cpp
@@ -3,7 +3,7 @@
 #include "tensors/cpu/prod_blas.h"

 #if BLAS_FOUND
-#include "3rd_party/faiss/IndexLSH.h"
+#include "faiss/IndexLSH.h"
 #endif

 namespace marian {
@@ -127,4 +127,4 @@ Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) {
 }
 #endif

-} // namespace marian
\ No newline at end of file
+} // namespace marian
diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h
index d93719d8e..4af120e4d 100644
--- a/src/tensors/cpu/expression_graph_packable.h
+++ b/src/tensors/cpu/expression_graph_packable.h
@@ -209,7 +209,7 @@ class ExpressionGraphPackable : public ExpressionGraph {
       if (gemmElementType == Type::intgemm8) {
 #if defined(WASM)
         ABORT("Int8::PrepareA is not implemented for wasm.");
-#else
+#elif defined(USE_INTGEMM)
         float quantMult = 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements());
         intgemm::Int8::PrepareA(tmp->data(), /*input*/
                                 paramMat->data(), /*output*/
@@ -218,11 +218,13 @@
                                 cols(val));
         //Put the quantMult at the back of the tensor
         *(reinterpret_cast<float *>(paramMat->data() + val->shape().elements())) = quantMult;
+#else
+        ABORT("Int8::PrepareA not implemented yet for ruy");
 #endif
       } else {
 #if defined(WASM)
         ABORT("Int16::PrepareA is not implemented for wasm.");
-#else
+#elif defined(USE_INTGEMM)
         float quantMult = 1024.0f;
         intgemm::Int16::PrepareA(tmp->data(), /*input*/
                                  paramMat->data(), /*output*/
@@ -231,6 +233,8 @@
                                  cols(val));
         //Put the quantMult at the back of the tensor
         *(reinterpret_cast<float *>(paramMat->data() + val->shape().elements())) = quantMult;
+#else
+        ABORT("Int16::PrepareA not implemented yet for ruy");
 #endif
       }
diff --git a/src/tensors/cpu/integer_common.cpp b/src/tensors/cpu/integer_common.cpp
index 21e7254fa..a941c0d02 100644
--- a/src/tensors/cpu/integer_common.cpp
+++ b/src/tensors/cpu/integer_common.cpp
@@ -1,5 +1,18 @@
 #include "integer_common.h"

+#ifndef USE_SIMDE
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+#else // USE_SIMDE
+// https://wiki.debian.org/SIMDEverywhere#Approach
+#include "x86/sse2.h"
+#include "x86/avx2.h"
+#include "x86/ssse3.h"
+#include "x86/sse.h"
+#endif
+
 namespace marian {
 namespace cpu {
 namespace integer {
@@ -39,4 +52,4 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) {

 } //integer
 } //cpu
-} //marian
\ No newline at end of file
+} //marian
diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index 97ca79c12..80aa31d24 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -4,15 +4,16 @@
 #include "tensors/tensor_operators.h"
 #include "tensors/cpu/aligned.h"
 #include "common/io_item.h"
+#ifdef USE_INTGEMM
 #include "3rd_party/intgemm/intgemm/intgemm.h"
+#else // USE_INTGEMM
+#include <ruy/ruy.h>
+#include "ruy_adapter.h"
+#endif // USE_INTGEMM
 #if defined(WASM)
 #include "wasm_intgemm_interface.h"
 #endif

-#include <emmintrin.h>
-#include <immintrin.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
 #include
 #include

@@ -27,6 +28,11 @@ inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }
 inline int cols(Shape& shape) { return shape[-1]; }
 inline int rows(Shape& shape) { return shape.elements() / cols(shape); }

+// This operates on floats after processing so doesn't care about int8_t vs int16_t.
+void AddBias(marian::Tensor C, const marian::Tensor Bias);
+
+#ifdef USE_INTGEMM
+
 template <Type type> struct intgemm_;

 template <> struct intgemm_<Type::intgemm8> {using width = intgemm::Int8;
                                              using type = int8_t;
                                              constexpr static const Type intgemmType = Type::intgemm8;};
 template <> struct intgemm_<Type::intgemm16> {using width = intgemm::Int16;
                                               using type = int16_t;
                                               constexpr static const Type intgemmType = Type::intgemm16;};
-// This operates on floats after processing so doesn't care about int8_t vs int16_t.
-void AddBias(marian::Tensor C, const marian::Tensor Bias);
+
+
+#else // USE_INTGEMM
+
+template <Type type> struct intgemm_;
+template <> struct intgemm_<Type::intgemm8> {using width = IntgemmViaRuy::Int8;
+                                             using type = IntgemmViaRuy::Int8::Type;
+                                             constexpr static const Type intgemmType = Type::intgemm8;};
+template <> struct intgemm_<Type::intgemm16> {using width = IntgemmViaRuy::Int16;
+                                              using type = IntgemmViaRuy::Int16::Type;
+                                              constexpr static const Type intgemmType = Type::intgemm16;};
+
+#endif // USE_INTGEMM

 // For loading architecture agnostic models. We do PrepareAndTranspose, because we already transposed
 // in our binary format.
 // Then we copy the quantizationMultiplier information at the end.
@@ -86,7 +103,7 @@ void prepareAndTransposeB(io::Item& item, const char * input) {
         //Copy the quantMult
         float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const typename intgemm_<vtype>::type *>(input) + item.shape.elements()));
         *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
-    #else
+    #else // COMPILE_CPU
         ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
     #endif
 }
@@ -106,4 +123,5 @@ void unquantizeWemb(io::Item& item, const char * input) {

 } //integer
 } //cpu
-} //marian
\ No newline at end of file
+} //marian
+
diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h
index 7b965d296..4dcfc83f3 100644
--- a/src/tensors/cpu/intgemm_interface.h
+++ b/src/tensors/cpu/intgemm_interface.h
@@ -42,7 +42,7 @@ bool shifted_;
                    rows(child(0)->val()),
                    cols(child(0)->val()),
                    val_->data() /*output*/);
-    #else
+    #elif defined(USE_INTGEMM)
    typedef typename intgemm_<vtype>::type Integer;
    if (!shifted_) {
      intgemm_<vtype>::width::PrepareA(child(0)->val()->data(), /*input*/
@@ -57,6 +57,14 @@ bool shifted_;
                    rows(child(0)->val()),
                    cols(child(0)->val()));
    }
+    #else
+    // Same as the unshifted case above; there is no shifted codepath on ARM.
+    typedef typename intgemm_<vtype>::type Integer;
+    intgemm_<vtype>::width::PrepareA(child(0)->val()->data(), /*input*/
+                   val_->data(), /*output*/
+                   *child(1)->val()->data(), /*Quant Mult*/
+                   rows(child(0)->val()),
+                   cols(child(0)->val()));
    #endif
  }};
  #else
@@ -258,8 +266,8 @@ struct QuantMultNodeOp : public UnaryNodeOp {
#pragma warning(push)
#pragma warning(disable: 4127) //VSCODE thinks line 222 is constant conditional expression, which it is only after the template resolution, not before.
  NodeOps forwardOps() override {
-#ifdef COMPILE_CPU
-    return {NodeOp(
+    return {[=](){
+      #ifdef COMPILE_CPU
      if (vtype == Type::int16) {
        *val_->data() = 1024.0f;
      } else if (child(0)->type() == "intgemmSelectColumnsB") {
@@ -269,17 +277,21 @@ struct QuantMultNodeOp : public UnaryNodeOp {
        *val_->data() = *(reinterpret_cast<float *>(reinterpret_cast<typename intgemm_<vtype>::type *>(child(0)->val()->data()) + child(0)->val()->shape().elements()));
      } else {
        if (child(0)->graph()->getBackend()->DumpQuantMult()) {
+          #if defined(USE_INTGEMM)
          intgemm::MeanStd meanstd = intgemm::GetVectorMeanStd(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements(), true);
          intgemm::MeanStd meanstd2 = intgemm::GetVectorMeanStd(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements());
          std::cerr << "Name: " << name() << " MeanAbs: " << meanstd.mean << " stddevAbs: " << meanstd.stddev << " Mean: " << meanstd2.mean << " stddev: "
          << meanstd2.stddev << " MaxAbs: " << intgemm::MaxAbsolute(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements()) << std::endl;
+          #endif
        }
+        #if defined(USE_INTGEMM)
        *val_->data() = 127.0f / intgemm::MaxAbsolute(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements());
+        #else
+        *val_->data() = 127.0f / IntgemmViaRuy::MaxAbsolute(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements());
+        #endif
      }
-    )};
-#else
-    return {NodeOp()};
-#endif
+      #endif // COMPILE_CPU
+    }};
  }
#pragma warning(pop)
  NodeOps backwardOps() override {
@@ -345,9 +357,11 @@ class PrepareBiasForBNodeOp : public NaryNodeOp {
      float scale_a = *quant_mult_a->data();
      float scale_b = *quant_mult_b->data();
      int8PrepareBias((const int8_t *)b->data(), scale_a, 0.0 /*zero_point_a*/, scale_b, 0.0 /*zero_point_b*/, rows(b), cols(b), bias->data(), val_->data());
-      #else
+      #elif defined(USE_INTGEMM)
      float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on
      intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias->data(), val_->data()));
+      #else
+      IntgemmViaRuy::PrepareBias(bias->data(), val_->data(), rows(b), cols(b));
      #endif
    }
  }};
@@ -382,9 +396,13 @@ class PrepareFakeBiasForBNodeOp : public NaryNodeOp {
      float scale_a = *quant_mult_a->data();
      float scale_b = *quant_mult_b->data();
      int8PrepareBias((const int8_t *)b->data(), scale_a, 0.0 /*zero_point_a*/, scale_b, 0.0 /*zero_point_b*/, rows(b), cols(b), nullptr/*input_bias*/, val_->data());
-      #else
+      #elif defined(USE_INTGEMM)
      float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on
      intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data()));
+      #else
+      const float *bias = nullptr;
+      float *bias_prepared = val_->data();
+      IntgemmViaRuy::PrepareBias(bias, bias_prepared, rows(b), cols(b));
      #endif
  }};
  #else
@@ -433,7 +451,7 @@ float scalar_;
               "Int16::Multiply is not implemented for wasm.");
      ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm8,
               "Int8::Multiply is not implemented for wasm.");
    #else
+    #elif defined(USE_INTGEMM)
    typedef typename intgemm_<vtype>::type Integer;
    intgemm_<vtype>::width::Multiply(reinterpret_cast<Integer *>(child(0)->val()->data()), /*A*/
                                     reinterpret_cast<Integer *>(child(1)->val()->data()), /*B*/
                                     rows(child(0)->val()),
                                     cols(child(0)->val()),
                                     cols(child(1)->val()),
                                     intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data()));
+    #else
+    typedef typename intgemm_<vtype>::type Integer;
+    intgemm_<vtype>::width::Multiply(reinterpret_cast<Integer *>(child(0)->val()->data()), /*A*/
+                                     reinterpret_cast<Integer *>(child(1)->val()->data()), /*B*/
+                                     val_->data(), /*output*/
+                                     rows(child(0)->val()),
+                                     cols(child(0)->val()),
+                                     cols(child(1)->val()),
+                                     unquant_mult);
+
    #endif
  }};
  #else
@@ -507,7 +535,7 @@ class AffineNodeOp : public NaryNodeOp {
                  cols(child(0)->val()),
                  cols(child(1)->val()),
                  val_->data());
-    #else
+    #elif defined(USE_INTGEMM)
    typedef typename intgemm_<vtype>::type Integer;
    if (!shifted_) {
      intgemm_<vtype>::width::Multiply(reinterpret_cast<Integer *>(child(0)->val()->data()), /*A*/
@@ -524,6 +552,18 @@ class AffineNodeOp : public NaryNodeOp {
                  cols(child(1)->val()), /*child(2) is bias*/
                  intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, child(2)->val()->data(), val_->data()));
    }
+    #else
+    typedef typename intgemm_<vtype>::type Integer;
+    intgemm_<vtype>::width::Multiply(reinterpret_cast<Integer *>(child(0)->val()->data()), /*A*/
+                                     reinterpret_cast<Integer *>(child(1)->val()->data()), /*B*/
+                                     child(2)->val()->data(), /*child(2) is bias*/
+                                     val_->data(), /*output*/
+                                     rows(child(0)->val()),
+                                     cols(child(0)->val()),
+                                     cols(child(1)->val()),
+                                     unquant_mult);
+
    #endif
  }};
  #else
diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp
index 4d761cf4b..8cc030539 100755
--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@@ -10,11 +10,9 @@
 #if MKL_FOUND
     #include <mkl.h>
 #elif BLAS_FOUND
-    #if WASM_COMPATIBLE_BLAS
-        #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
-    #else
-        #include <cblas.h>
-    #endif // WASM_COMPATIBLE_BLAS
+    #include <cblas.h>
+#elif USE_ONNX_SGEMM
+    #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
 #endif

 #include "integer_common.h"
@@ -79,7 +77,6 @@ void ProdBatchedOld(marian::Tensor C,
                     bool transB,
                     float beta,
                     float scalar) {
-#if BLAS_FOUND
  float alpha = scalar;
  size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
@@ -183,10 +180,6 @@ void ProdBatchedOld(marian::Tensor C,
                (int)ldc);
  }
#endif
-#else
-  C; A; B; transA; transB; beta; scalar;
-  ABORT("You need to compile with MKL in order to use the CPU version");
-#endif
}

void ProdBatched(marian::Tensor C,
diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h
index 1d6757927..c9dd6d7bc 100644
--- a/src/tensors/cpu/prod_blas.h
+++ b/src/tensors/cpu/prod_blas.h
@@ -1,11 +1,10 @@
+#pragma once
 #if MKL_FOUND
-#include <mkl.h>
+    #include <mkl.h>
 #elif BLAS_FOUND
-    #if WASM_COMPATIBLE_BLAS
-        #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
-    #else
     #include <cblas.h>
-    #endif // WASM_COMPATIBLE_BLAS
+#elif USE_ONNX_SGEMM
+    #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
 #endif

 inline void sgemm(bool transA,
@@ -22,9 +21,6 @@ inline void sgemm(bool transA,
                   float* c,
                   int ldc) {
 #if BLAS_FOUND
-    #if WASM_COMPATIBLE_BLAS
-        gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c);
-    #else
     cblas_sgemm(CblasRowMajor,
                 transA ? CblasTrans : CblasNoTrans,
                 transB ? CblasTrans : CblasNoTrans,
                 rows_a,
                 rows_b,
                 width,
                 alpha,
                 a,
                 lda,
                 b,
                 ldb,
                 beta,
                 c,
                 ldc);
-    #endif
+#elif USE_ONNX_SGEMM
+    gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c);
 #else
     transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy
     ABORT("Marian must be compiled with a BLAS library");
diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
new file mode 100644
index 000000000..d4bbfc002
--- /dev/null
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -0,0 +1,534 @@
+/*
+ * This file follows intgemm and is a means of retrofitting ruy into the intgemm based wiring in
+ * `intgemm_interface.h`. ruy is an inference backend used in TensorFlow and Android deployments and
+ * has an optimized ARM backend for the multiply operations required. Optimized code for quantize,
+ * unquantize, transpose are added separately to connect the multiply library to marian.
+ */
+
+#pragma once
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include "ruy/platform.h"
+#include "ruy/system_aligned_alloc.h"
+
+#if RUY_PLATFORM_NEON
+#include <arm_neon.h>
+#endif
+
+namespace marian {
+namespace cpu {
+namespace integer {
+
+using Index = unsigned int;
+
+/*
+ * An AlignedVector is similar to intgemm's aligned allocations. Defined here
+ * independently because we are ignoring intgemm path entirely on ARM.
+ */
+template <class T>
+class AlignedVector {
+public:
+  AlignedVector(size_t num_elem)
+      : size_(num_elem),
+        storage_(reinterpret_cast<T *>(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {}
+
+  T *begin() { return storage_; }
+  T *data() { return storage_; }
+  size_t size() const { return size_; }
+  size_t memSize() const { return sizeof(T) * size_; }
+
+  // Forbid copy
+  AlignedVector(const AlignedVector &) = delete;
+  AlignedVector &operator=(const AlignedVector &) = delete;
+
+  ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast<void *>(storage_)); }
+
+private:
+  size_t size_;
+  T *storage_;
+};
+
+enum class Path { kStandardCpp = 0, kNeon = 1 };
+
+#if RUY_PLATFORM_NEON
+constexpr Path kHighestPath = Path::kNeon;
+#else
+constexpr Path kHighestPath = Path::kStandardCpp;
+#endif
+
+template <Path path>
+struct Preprocess;
+
+/*
+ * Naive implementation using standard C++ functions. Not optimized using SIMD operations.
+ */
+template <>
+struct Preprocess<Path::kStandardCpp> {
+  static void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) {
+    const Index size = rows * width;
+    for(Index i = 0; i < size; i++) {
+      // Round to nearest after multiplying with scale.
+      float value = roundf(scale * input[i]);
+
+      // Since float can store bigger values, we threshold anything that's gone
+      // higher and can't fit in int8.
+      value = std::max(-127.0f, value);
+      value = std::min(127.0f, value);
+
+      // Finally a static cast.
+      output[i] = static_cast<int8_t>(value);
+    };
+  }
+
+  template <class Scalar>
+  static void transpose(const Scalar *input, Index rows, Index cols, Scalar *output) {
+    for(Index i = 0; i < rows; i++) {
+      for(Index j = 0; j < cols; j++) {
+        output[j * rows + i] = input[i * cols + j];
+      }
+    }
+  }
+
+  static void unquantizeAddBias(const int32_t *input,
+                                const float *input_bias_prepared,
+                                float unquant_multiplier,
+                                Index rows_A,
+                                Index cols_B,
+                                float *output) {
+    for(Index i = 0; i < rows_A; i++) {
+      for(Index j = 0; j < cols_B; j++) {
+        Index idx = i * cols_B + j;
+        output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j];
+      }
+    }
+  }
+
+  static void unquantize(const int32_t *input,
+                         float unquant_multiplier,
+                         Index rows_A,
+                         Index cols_B,
+                         float *output) {
+    for(Index i = 0; i < rows_A; i++) {
+      for(Index j = 0; j < cols_B; j++) {
+        Index idx = i * cols_B + j;
+        output[idx] = (input[idx] * unquant_multiplier);
+      }
+    }
+  }
+};
+
+#if RUY_PLATFORM_NEON
+
+/*
+ * Optimized path using ARM NEON SIMD intrinsics. Currently only supports int8_t.
+ * TODO: Expand support to 16-bit.
+ */
+template <>
+struct Preprocess<Path::kNeon> {
+  static void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) {
+    const float32x4_t *Input = reinterpret_cast<const float32x4_t *>(input);
+    const float32x4_t *InputEnd = reinterpret_cast<const float32x4_t *>(input + rows * width);
+
+    int8x8_t *Output = reinterpret_cast<int8x8_t *>(output);
+    while(Input != InputEnd) {
+      // Vector multiply by scalar
+      // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);
+      // VMUL.F32 q0,q0,d0[0]
+      float32x4_t scaledFloat_lo = vmulq_n_f32(*Input++, scale);
+
+      // Convert from float
+      // int32x4_t vcvtnq_s32_f32(float32x4_t a);
+      // VCVT.S32.F32 q0, q0
+      int32x4_t scaledInt_lo = vcvtnq_s32_f32(scaledFloat_lo);
+
+      // Vector saturating narrow integer
+      // int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+      int16x4_t s16x4_lo = vqmovn_s32(scaledInt_lo);
+
+      // Vector multiply by scalar
+      // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);
+      // VMUL.F32 q0,q0,d0[0]
+      float32x4_t scaledFloat_hi = vmulq_n_f32(*Input++, scale);
+
+      // Convert from float
+      // int32x4_t vcvtnq_s32_f32(float32x4_t a);
+      // VCVT.S32.F32 q0, q0
+      int32x4_t scaledInt_hi = vcvtnq_s32_f32(scaledFloat_hi);
+
+      // Vector saturating narrow integer
+      // int16x4_t vqmovn_s32(int32x4_t a);
+      // VQMOVN.S32 d0,q0
+      int16x4_t s16x4_hi = vqmovn_s32(scaledInt_hi);
+
+      // Combine two ints.
+      // int16x8_t vcombine_s16(int16x4_t low, int16x4_t high);
+      int16x8_t s16x8 = vcombine_s16(s16x4_lo, s16x4_hi);
+
+      // Vector saturating narrow integer
+      int8x8_t s8x8 = vqmovn_s16(s16x8);
+
+      *Output = s8x8;
+      ++Output;
+    };
+  }
+
+  template <class Scalar>
+  static void transpose(const Scalar *input, Index rows, Index cols, Scalar *output) {
+    // This is a template with an abort. The specialized implementation is done
+    // below for int8_t.
+    std::abort();
+  }
+
+  // Specialization for int8_t
+  static void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) {
+    constexpr Index tile_size = 16;
+    // TODO(jerin): Enable
+    // assert(rows % tile_size == 0 && cols % tile_size == 0);
+    for(Index i = 0; i < rows; i += tile_size) {
+      for(Index j = 0; j < cols; j += tile_size) {
+        _transpose_16x16(input, i, j, rows, cols, output);
+      }
+    }
+  }
+
+  static void _transpose_16x16(const int8_t *src,
+                               Index i,
+                               Index j,
+                               Index rows,
+                               Index cols,
+                               int8_t *dst) {
+    // Implemented following the algorithm described in
+    // https://stackoverflow.com/a/29587984/4565794
+    //
+    // permute n 32-bit rows
+    // permute n 64-bit rows
+    // ...
+    // permute n simd_width/2-bit rows
+
+    // clang-format off
+
+    // Permute 8 8-bit rows.
+    // Load int8x16x2 from memory into SIMD registers, transpose as 2x2 matrices.
+
+    Index srcRowBegin = i*cols + j;
+    int8x16x2_t r0 = vtrnq_s8(vld1q_s8(&src[ 0*cols + srcRowBegin]), vld1q_s8(&src[ 1*cols + srcRowBegin]));
+    int8x16x2_t r1 = vtrnq_s8(vld1q_s8(&src[ 2*cols + srcRowBegin]), vld1q_s8(&src[ 3*cols + srcRowBegin]));
+    int8x16x2_t r2 = vtrnq_s8(vld1q_s8(&src[ 4*cols + srcRowBegin]), vld1q_s8(&src[ 5*cols + srcRowBegin]));
+    int8x16x2_t r3 = vtrnq_s8(vld1q_s8(&src[ 6*cols + srcRowBegin]), vld1q_s8(&src[ 7*cols + srcRowBegin]));
+    int8x16x2_t r4 = vtrnq_s8(vld1q_s8(&src[ 8*cols + srcRowBegin]), vld1q_s8(&src[ 9*cols + srcRowBegin]));
+    int8x16x2_t r5 = vtrnq_s8(vld1q_s8(&src[10*cols + srcRowBegin]), vld1q_s8(&src[11*cols + srcRowBegin]));
+    int8x16x2_t r6 = vtrnq_s8(vld1q_s8(&src[12*cols + srcRowBegin]), vld1q_s8(&src[13*cols + srcRowBegin]));
+    int8x16x2_t r7 = vtrnq_s8(vld1q_s8(&src[14*cols + srcRowBegin]), vld1q_s8(&src[15*cols + srcRowBegin]));
+
+
+    // Permute 8 16-bit rows.
+    // Next step is to treat the entries as int16x8x2 (via cast) and do
+    // transpose for int16, which will now leave intra-2 pairs intact while
+    // transposing inter 2-pairs into the right places.
+    int16x8x2_t t0 = vtrnq_s16(vreinterpretq_s16_s8(r0.val[0]), vreinterpretq_s16_s8(r1.val[0]));
+    int16x8x2_t t1 = vtrnq_s16(vreinterpretq_s16_s8(r2.val[0]), vreinterpretq_s16_s8(r3.val[0]));
+    int16x8x2_t t2 = vtrnq_s16(vreinterpretq_s16_s8(r4.val[0]), vreinterpretq_s16_s8(r5.val[0]));
+    int16x8x2_t t3 = vtrnq_s16(vreinterpretq_s16_s8(r6.val[0]), vreinterpretq_s16_s8(r7.val[0]));
+    int16x8x2_t t4 = vtrnq_s16(vreinterpretq_s16_s8(r0.val[1]), vreinterpretq_s16_s8(r1.val[1]));
+    int16x8x2_t t5 = vtrnq_s16(vreinterpretq_s16_s8(r2.val[1]), vreinterpretq_s16_s8(r3.val[1]));
+    int16x8x2_t t6 = vtrnq_s16(vreinterpretq_s16_s8(r4.val[1]), vreinterpretq_s16_s8(r5.val[1]));
+    int16x8x2_t t7 = vtrnq_s16(vreinterpretq_s16_s8(r6.val[1]), vreinterpretq_s16_s8(r7.val[1]));
+
+    // Permute 8 32-bit rows.
+ int32x4x2_t x0 = vtrnq_s32(vreinterpretq_s32_s16(t0.val[0]), vreinterpretq_s32_s16(t1.val[0])); + int32x4x2_t x1 = vtrnq_s32(vreinterpretq_s32_s16(t4.val[0]), vreinterpretq_s32_s16(t5.val[0])); + int32x4x2_t x2 = vtrnq_s32(vreinterpretq_s32_s16(t0.val[1]), vreinterpretq_s32_s16(t1.val[1])); + int32x4x2_t x3 = vtrnq_s32(vreinterpretq_s32_s16(t4.val[1]), vreinterpretq_s32_s16(t5.val[1])); + + int32x4x2_t x4 = vtrnq_s32(vreinterpretq_s32_s16(t2.val[0]), vreinterpretq_s32_s16(t3.val[0])); + int32x4x2_t x5 = vtrnq_s32(vreinterpretq_s32_s16(t6.val[0]), vreinterpretq_s32_s16(t7.val[0])); + int32x4x2_t x6 = vtrnq_s32(vreinterpretq_s32_s16(t2.val[1]), vreinterpretq_s32_s16(t3.val[1])); + int32x4x2_t x7 = vtrnq_s32(vreinterpretq_s32_s16(t6.val[1]), vreinterpretq_s32_s16(t7.val[1])); + + // There is no permute 8 64-bit rows available. + // Instead we follow extracting low and high and placing them into the right places. + Index dstRowBegin = j*rows + i; + vst1q_s8(&dst[ 0*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x0.val[0]), vget_low_s32(x4.val[0])))); + vst1q_s8(&dst[ 1*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x1.val[0]), vget_low_s32(x5.val[0])))); + vst1q_s8(&dst[ 2*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x2.val[0]), vget_low_s32(x6.val[0])))); + vst1q_s8(&dst[ 3*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x3.val[0]), vget_low_s32(x7.val[0])))); + vst1q_s8(&dst[ 4*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x0.val[1]), vget_low_s32(x4.val[1])))); + vst1q_s8(&dst[ 5*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x1.val[1]), vget_low_s32(x5.val[1])))); + vst1q_s8(&dst[ 6*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x2.val[1]), vget_low_s32(x6.val[1])))); + vst1q_s8(&dst[ 7*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x3.val[1]), vget_low_s32(x7.val[1])))); + + vst1q_s8(&dst[ 8*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x0.val[0]), vget_high_s32(x4.val[0])))); + vst1q_s8(&dst[ 9*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x1.val[0]), vget_high_s32(x5.val[0])))); + vst1q_s8(&dst[10*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[0]), vget_high_s32(x6.val[0])))); + vst1q_s8(&dst[11*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[0]), vget_high_s32(x7.val[0])))); + vst1q_s8(&dst[12*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x0.val[1]), vget_high_s32(x4.val[1])))); + vst1q_s8(&dst[13*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x1.val[1]), vget_high_s32(x5.val[1])))); + vst1q_s8(&dst[14*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[1]), vget_high_s32(x6.val[1])))); + vst1q_s8(&dst[15*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[1]), vget_high_s32(x7.val[1])))); + + // clang-format on + } + + static void unquantizeAddBias(const int32_t *input, + const float *input_bias_prepared, + float unquant_multiplier, + Index rows_A, + Index cols_B, + float *output) { + // Set all registers in lane from same scalar value. 
+    float32x4_t multiplier = vdupq_n_f32(unquant_multiplier);
+    const int32x4_t *Input = reinterpret_cast<const int32x4_t *>(input);
+    const int32x4_t *InputEnd = reinterpret_cast<const int32x4_t *>(input + rows_A * cols_B);
+    float32x4_t *Output = reinterpret_cast<float32x4_t *>(output);
+
+    while(Input != InputEnd) {
+      // Bias cycles every column for addition.
+      const float32x4_t *Bias = reinterpret_cast<const float32x4_t *>(input_bias_prepared);
+
+      // RowEnd needs to be determined to end the inner while loop below.
+      const int32x4_t *RowEnd
+          = reinterpret_cast<const int32x4_t *>(reinterpret_cast<const int32_t *>(Input) + cols_B);
+
+      while(Input != RowEnd) {
+        // Operation happening for 4-elements together:
+        // output = [int32_t]input * [float]quant_mult + [float]bias;
+        float32x4_t floatInput = vcvtq_f32_s32(*Input++);
+        float32x4_t unquantized = vmulq_f32(floatInput, multiplier);
+        *Output++ = vaddq_f32(unquantized, *Bias++);
+      }
+    }
+  }
+
+  static void unquantize(const int32_t *input,
+                         float unquant_multiplier,
+                         Index rows_A,
+                         Index cols_B,
+                         float *output) {
+    // Set all registers in lane from same scalar value.
+    float32x4_t multiplier = vdupq_n_f32(unquant_multiplier);
+    const int32x4_t *Input = reinterpret_cast<const int32x4_t *>(input);
+    const int32x4_t *InputEnd = reinterpret_cast<const int32x4_t *>(input + rows_A * cols_B);
+    float32x4_t *Output = reinterpret_cast<float32x4_t *>(output);
+
+    while(Input != InputEnd) {
+      // RowEnd needs to be determined to end the inner while loop below.
+      const int32x4_t *RowEnd
+          = reinterpret_cast<const int32x4_t *>(reinterpret_cast<const int32_t *>(Input) + cols_B);
+
+      while(Input != RowEnd) {
+        // Operation happening for 4-elements together:
+        // output = [int32_t]input * [float]quant_mult;
+        float32x4_t floatInput = vcvtq_f32_s32(*Input++);
+        float32x4_t unquantized = vmulq_f32(floatInput, multiplier);
+        *Output++ = unquantized;
+      }
+    }
+  }
+};
+
+#endif
+
+/*
+ * The following nomenclature comes from intgemm. The current state of code is to keep the
+ * intgemm_interface.h diff minimal. There are possibly better abstractions.
+ */
+struct IntgemmViaRuy {
+  // Convert compile time errors into run-time ABORTs. This allows bringing in only int8_t and
+  // select functions that are required to create a path which will run while not achieving parity
+  // with intgemm.
+  template <class T>
+  struct IntBase {
+    using Type = T;
+    static void Quantize(const float *, Type *, float, Index) { ABORT("Quantize unsupported"); }
+
+    static void PrepareA(const float *input,
+                         Type *output,
+                         float quant_mult,
+                         Index rows,
+                         Index cols) {
+      ABORT("PrepareA Unsupported");
+    }
+
+    static void PrepareB(const float *, Type *, float, Index, Index) {
+      ABORT("PrepareB Unsupported");
+    }
+    static void PrepareBQuantizedTransposed(const Type *, Type *, Index, Index) {
+      ABORT("PrepareBQuantizedTransposed Unsupported");
+    }
+    static void PrepareBTransposed(const float *, Type *, float, Index, Index) {
+      ABORT("PrepareBTransposed Unsupported");
+    }
+    static void SelectColumnsB(const Type *, Type *, Index, const Index *, const Index *) {
+      ABORT("SelectColumnsB Unsupported");
+    }
+
+    static void
+    Multiply(const Type *, const Type *, const float *, const float *, Index, Index, Index, float) {
+      ABORT("Multiply (A*B + bias) Unsupported");
+    }
+
+    static void Multiply(const Type *, const Type *, const float *, Index, Index, Index, float) {
+      ABORT("Multiply (A*B) Unsupported");
+    }
+  };
+
+  // Intgemm nomenclature expects Int8. Missing functions are ABORTs.
+  struct Int8 : IntBase<int8_t> {
+    using Type = int8_t;
+    static void PrepareBQuantizedTransposed(const Type *input,
+                                            Type *output,
+                                            Index rows,
+                                            Index cols) {
+      std::memcpy(output, input, /*count=*/sizeof(Type) * (rows * cols));
+    }
+
+    static void PrepareBTransposed(const float *input,
+                                   Type *output,
+                                   float quant_mult,
+                                   Index rows,
+                                   Index cols) {
+      Preprocess<kHighestPath>::quantize(input, output, quant_mult, rows, cols);
+    }
+
+    static void PrepareA(const float *input,
+                         int8_t *output,
+                         float quant_mult,
+                         Index rows,
+                         Index cols) {
+      Preprocess<kHighestPath>::quantize(input, output, quant_mult, rows, cols);
+    }
+
+    static void SelectColumnsB(const Type *input,
+                               Type *output,
+                               Index width,
+                               const Index *cols,
+                               const Index *cols_end) {
+      // B_prepared is expected to be col-major, for our implementation via ruy. If
+      // col-major we can memcpy the respective column entries as they're
+      // sequential. There are width=rows entries.
+      Index num_cols = static_cast<Index>(std::distance(cols, cols_end));
+      for(Index c = 0; c < num_cols; ++c) {
+        std::memcpy(&(output[c * width]), &(input[cols[c] * width]), width);
+      }
+    }
+
+    // We don't have callback and no-op capability here yet. Multiply is kept similar to the
+    // Mozilla specification and there are overloads with and without bias to avoid an if inside.
+    // This method corresponds to the one with bias.
+    // output = A*B + bias
+    static void Multiply(const Type *input_A_prepared,
+                         const Type *input_B_prepared,
+                         const float *bias_prepared,
+                         float *output,
+                         Index rows_A,
+                         Index width,
+                         Index cols_B,
+                         float unquant_multiplier) {
+      // It is expected that somehow we have managed to call all prepare by the time
+      // we are here, with inputs (prepared) in int8_t. All that's left to do is use
+      // ruy for multiply and then start with the reverse ops to get to fp32.
+
+      // Use ruy to multiply.
+      // The following is adapted from
+      // https://github.com/google/ruy/blob/878283640de7946a43053e8ebf4f15114fbc9156/example/example.cc#L129-L152
+
+      ruy::Context context;
+      ruy::Matrix<std::int8_t> lhs;
+      ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout());
+      lhs.set_data(input_A_prepared);
+
+      // PRINT_MATRIX_DEBUG(input_A_prepared, rows_A, width, Order::RowMajor);
+
+      ruy::Matrix<std::int8_t> rhs;
+      ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout());
+      rhs.set_data(input_B_prepared);
+
+      // PRINT_MATRIX_DEBUG(input_B_prepared, width, cols_B, Order::ColMajor);
+
+      ruy::Matrix<std::int32_t> dst;
+      ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout());
+
+      AlignedVector<std::int32_t> dst_data(rows_A * cols_B);
+      std::int32_t *dest_ptr = dst_data.data();
+
+      dst.set_data(dest_ptr);
+
+      // When Dst is int32, mul_params is unused.
+      ruy::MulParams<std::int32_t, std::int32_t> mul_params;
+      ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+      // Unquantizes, then adds bias in a single statement on the output.
+      // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B);
+      Preprocess<kHighestPath>::unquantizeAddBias(
+          dest_ptr, bias_prepared, unquant_multiplier, rows_A, cols_B, output);
+    }
+
+    // output = A*B (notice no bias).
+    static void Multiply(const Type *input_A_prepared,
+                         const Type *input_B_prepared,
+                         float *output,
+                         Index rows_A,
+                         Index width,
+                         Index cols_B,
+                         float unquant_multiplier) {
+      // It is expected that somehow we have managed to call all prepare by the time
+      // we are here, with inputs (prepared) in int8_t. All that's left to do is use
+      // ruy for multiply and then start with the reverse ops to get to fp32.
+
+      // Use ruy to multiply.
+      // The following is adapted from
+      // https://github.com/google/ruy/blob/878283640de7946a43053e8ebf4f15114fbc9156/example/example.cc#L129-L152
+
+      ruy::Context context;
+      ruy::Matrix<std::int8_t> lhs;
+      ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout());
+      lhs.set_data(input_A_prepared);
+
+      ruy::Matrix<std::int8_t> rhs;
+      ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout());
+      rhs.set_data(input_B_prepared);
+
+      ruy::Matrix<std::int32_t> dst;
+      ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout());
+
+      AlignedVector<std::int32_t> dst_data(rows_A * cols_B);
+      std::int32_t *dest_ptr = dst_data.data();
+
+      dst.set_data(dest_ptr);
+
+      // When Dst is int32, mul_params is unused.
+      ruy::MulParams<std::int32_t, std::int32_t> mul_params;
+      ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+      // Unquantizes, then adds bias in a single statement on the output.
+      // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B);
+      Preprocess<kHighestPath>::unquantize(dest_ptr, unquant_multiplier, rows_A, cols_B, output);
+    }
+  };
+
+  // Int16 support is currently missing.
+  struct Int16 : IntBase<int16_t> {
+    using Type = int16_t;
+  };
+
+  template <class T>
+  static T MaxAbsolute(const T *begin, const T *end) {
+    T result = 0;
+    for(auto p = begin; p < end; ++p) {
+      result = std::max(result, std::abs(*p));
+    }
+    return result;
+  }
+
+  static void PrepareBias(const float *input, float *output, Index rows, Index cols) {
+    assert(input != nullptr && output != nullptr);
+    std::memcpy(output, input, /*count=*/sizeof(float) * (1 * cols));
+  }
+};
+
+} // namespace integer
+} // namespace cpu
+} // namespace marian
\ No newline at end of file

From 2ac7cbc95cd3213797490b6d712385417fa66c32 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Wed, 9 Mar 2022 20:32:18 +0000
Subject: [PATCH 02/67] Fix sentencepiece submodule mixup

---
 src/3rd_party/sentencepiece | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece
index c307b874d..3ffdc0065 160000
--- a/src/3rd_party/sentencepiece
+++ b/src/3rd_party/sentencepiece
@@ -1 +1 @@
-Subproject commit c307b874deb5ea896db8f93506e173353e66d4d3
+Subproject commit 3ffdc0065a03cadd9d0e5e123aaf9b6ea7ffb05d

From 96749735709d9837c9cf49065010fd0f95f1da2c Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Sun, 10 Apr 2022 21:47:34 +0000
Subject: [PATCH 03/67] [sentencepiece] android cmake additional libs

---
 src/3rd_party/sentencepiece | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece
index 3ffdc0065..7f669c3f8 160000
--- a/src/3rd_party/sentencepiece
+++ b/src/3rd_party/sentencepiece
@@ -1 +1 @@
-Subproject commit 3ffdc0065a03cadd9d0e5e123aaf9b6ea7ffb05d
+Subproject commit 7f669c3f8f5fcc288838f3beba88a04533824f73

From f3e78185f7c8073cc5db7ea14f555c46ad63e344 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Sun, 10 Apr 2022 21:48:02 +0000
Subject: [PATCH 04/67] Remove separately added patch in favour of submodule update

---
 .github/workflows/arm.yml             |  4 ----
 patches/01-spm-protobuf-android.patch | 14 --------------
 2 files changed, 18 deletions(-)
 delete mode 100644 patches/01-spm-protobuf-android.patch

diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml
index 4cab55c9e..e10d29d6f 100644
--- a/.github/workflows/arm.yml
+++ b/.github/workflows/arm.yml
@@ -68,10 +68,6 @@ jobs:
           ccache -s # Print current cache stats
           ccache -z # Zero cache entry
 
-      - name: Apply patch sentencepiece for android
-        run: |-
-          patch -p1 < 
patches/01-spm-protobuf-android.patch
-
       - name: Generate buildfiles for marian on android via cmake
         run: |-
           mkdir -p build

diff --git a/patches/01-spm-protobuf-android.patch b/patches/01-spm-protobuf-android.patch
deleted file mode 100644
index 6eadb5454..000000000
--- a/patches/01-spm-protobuf-android.patch
+++ /dev/null
@@ -1,14 +0,0 @@
-diff --git a/src/3rd_party/sentencepiece/src/CMakeLists.txt b/src/3rd_party/sentencepiece/src/CMakeLists.txt
-index 0ba6407..276caec 100644
---- a/ src/3rd_party/sentencepiece/src/CMakeLists.txt
-+++ b/src/3rd_party/sentencepiece/src/CMakeLists.txt
-@@ -159,6 +159,9 @@ set(SPM_TEST_SRCS
- find_package(Threads REQUIRED)
- 
- set(SPM_LIBS ${PROTOBUF_LITE_LIBRARY} Threads::Threads)
-+if(ANDROID)
-+  set(SPM_LIBS ${SPM_LIBS} android log)
-+endif(ANDROID)
- 
- if (SPM_ENABLE_NFKC_COMPILE)
-   find_package(ICU 4.4 COMPONENTS i18n data uc REQUIRED)

From 5250b9e81ebbab8974e7c9834c1e0457bf92116c Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Sun, 10 Apr 2022 21:53:43 +0000
Subject: [PATCH 05/67] Remove trailing newline in integer_common.h to prettify diff

---
 src/tensors/cpu/integer_common.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index 80aa31d24..67ee1e94f 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -124,4 +124,3 @@ void unquantizeWemb(io::Item& item, const char * input) {
 } //integer
 } //cpu
 } //marian
-

From 26d3ba25ab7aa52364918ec94d748229fdb1d3d4 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Sun, 10 Apr 2022 21:58:16 +0000
Subject: [PATCH 06/67] Remove trailing newline in ruy_adapter.h

---
 src/tensors/cpu/ruy_adapter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index d4bbfc002..ec03903dd 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -531,4 +531,4 @@ struct IntgemmViaRuy {
 
 } // namespace integer
 } // namespace cpu
-} // namespace marian
\ No newline at end of file
+} // namespace marian

From b271b70bfe7337a4cacca789cc657dc020968cc2 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Thu, 14 Apr 2022 13:35:31 +0000
Subject: [PATCH 07/67] In-place multiply without malloc by reinterpret_cast

---
 src/tensors/cpu/ruy_adapter.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index ec03903dd..725d1aeff 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -494,9 +494,7 @@ struct IntgemmViaRuy {
       ruy::Matrix<std::int32_t> dst;
       ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout());
 
-      AlignedVector<std::int32_t> dst_data(rows_A * cols_B);
-      std::int32_t *dest_ptr = dst_data.data();
-
+      std::int32_t *dest_ptr = reinterpret_cast<std::int32_t *>(output);
       dst.set_data(dest_ptr);
 
       // When Dst is int32, mul_params is unused.
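The in-place multiply introduced in PATCH 07 above works because ruy's int32 accumulator and an fp32 output element have the same size and alignment, so the output buffer can double as scratch space: ruy writes int32 values into it, and the unquantize pass then converts each slot to float in place. A minimal standalone sketch of that pattern, not part of the patch and with illustrative names only:

    #include <cstdint>
    #include <cstddef>

    // Unquantize in place: `output` currently holds int32 GEMM results written
    // by the integer GEMM; each slot is read as int32 and then overwritten with
    // the unquantized float. Assumes sizeof(std::int32_t) == sizeof(float) and
    // tolerates the same type-punning caveat as the reinterpret_cast in the patch.
    inline void unquantizeInPlace(float *output, std::size_t size, float unquant_multiplier) {
      const std::int32_t *quantized = reinterpret_cast<const std::int32_t *>(output);
      for(std::size_t i = 0; i < size; ++i) {
        const std::int32_t q = quantized[i];  // read before the slot is overwritten
        output[i] = static_cast<float>(q) * unquant_multiplier;
      }
    }

Reading each slot before writing it keeps the loop correct even though input and output alias; the trade-off is one fewer allocation per Multiply at the cost of strict-aliasing purity.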
From efa5a854df77f4da3a85284cc979ad1dc87c65fb Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Thu, 14 Apr 2022 14:44:17 +0000
Subject: [PATCH 08/67] Documentation for the stdcpp/NEON paths created

---
 src/tensors/cpu/ruy_adapter.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index 725d1aeff..490c19b4d 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -49,7 +49,16 @@ class AlignedVector {
   T *storage_;
 };
 
-enum class Path { kStandardCpp = 0, kNeon = 1 };
+// The following partitions a pure C++ slow implementation and a faster SIMD implementation using
+// NEON intrinsics on ARM hardware. Ruy already has such a routing, but we add some preprocessing
+// and postprocessing functions (quantize, transpose, unquantize) that are outside ruy's offerings
+// and required in the fast matrix-multiplication workflow for machine-translation, that exists in
+// marian.
+
+enum class Path {
+  kStandardCpp = 0,  // Pure C++
+  kNeon = 1          // NEON Intrinsics (ARM)
+};
 
 #if RUY_PLATFORM_NEON
 constexpr Path kHighestPath = Path::kNeon;

From 179f239a7715291be824eb7fcbd546415a98858d Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Mon, 18 Apr 2022 08:07:29 +0000
Subject: [PATCH 09/67] Remove templated abort transpose()

---
 src/tensors/cpu/ruy_adapter.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index 490c19b4d..aaa0ac18c 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -182,13 +182,6 @@ struct Preprocess {
     };
   }
 
-  template <class Scalar>
-  static void transpose(const Scalar *input, Index rows, Index cols, Scalar *output) {
-    // This is a template with an abort. The specialized implementation is done
-    // below for int8_t.
-    std::abort();
-  }
-
   // Specialization for int8_t
   static void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) {
     constexpr Index tile_size = 16;

From 0d189c81a526ab87608b26bb6fcbb7a0d5828ba0 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Mon, 18 Apr 2022 08:14:21 +0000
Subject: [PATCH 10/67] Reinterpret at unquantize add bias as well as int32_t
 from float32_t

---
 src/tensors/cpu/ruy_adapter.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index aaa0ac18c..ee3cf9822 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -453,9 +453,7 @@ struct IntgemmViaRuy {
       ruy::Matrix<std::int32_t> dst;
       ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout());
 
-      AlignedVector<std::int32_t> dst_data(rows_A * cols_B);
-      std::int32_t *dest_ptr = dst_data.data();
-
+      std::int32_t *dest_ptr = reinterpret_cast<std::int32_t *>(output);
       dst.set_data(dest_ptr);
 
       // When Dst is int32, mul_params is unused.

From 8951261047b4ce60d6e2ec4c400ed589273b27fc Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Mon, 18 Apr 2022 18:17:03 +0000
Subject: [PATCH 11/67] Remove AlignedVector from ruy_adapter - not required
 here.

---
 src/tensors/cpu/ruy_adapter.h | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index ee3cf9822..3a04e475d 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -22,33 +22,6 @@ namespace integer {
 
 using Index = unsigned int;
 
-/*
- * An AlignedVector is similar to intgemm's aligned allocations. 
Defined here - * independently because we are ignoring intgemm path entirely on ARM. - */ -template -class AlignedVector { -public: - AlignedVector(size_t num_elem) - : size_(num_elem), - storage_(reinterpret_cast(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {} - - T *begin() { return storage_; } - T *data() { return storage_; } - size_t size() const { return size_; } - size_t memSize() const { return sizeof(T) * size_; } - - // Forbid copy - AlignedVector(const AlignedVector &) = delete; - AlignedVector &operator=(const AlignedVector &) = delete; - - ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast(storage_)); } - -private: - size_t size_; - T *storage_; -}; - // The following partitions a pure C++ slow implementation and a faster SIMD implementation using // NEON intrinsics on ARM hardware. Ruy already has such a routing, but we add some preprocessing // and postprocessing functions (quantize, transpose, unquantize) that are outside ruy's offerings From 49beb50ba1935bc2f27b20ec07804b2ba7cb09d8 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 18:26:15 +0000 Subject: [PATCH 12/67] Remove ViaRuy::PrepareBias without effect to output --- src/tensors/cpu/intgemm_interface.h | 6 ++---- src/tensors/cpu/ruy_adapter.h | 5 ----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 194169c60..57d01081b 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -363,7 +363,7 @@ class PrepareBiasForBNodeOp : public NaryNodeOp { float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias->data(), val_->data())); #else - IntgemmViaRuy::PrepareBias(nullptr, val_->data(), rows(b), cols(b)); + ABORT("PrepareBias should not be called on ARM"); #endif } }}; @@ -402,9 +402,7 @@ class PrepareFakeBiasForBNodeOp : public NaryNodeOp { float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); #else - const float *bias = nullptr; - float *bias_prepared = val_->data(); - IntgemmViaRuy::PrepareBias(bias, bias_prepared, rows(b), cols(b)); + // Not sure what's going on here. 
#endif }}; #else diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 3a04e475d..a200c331b 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -493,11 +493,6 @@ struct IntgemmViaRuy { } return result; } - - static void PrepareBias(const float *input, float *output, Index rows, Index cols) { - assert(input != nullptr && output != nullptr); - std::memcpy(output, input, /*count=*/sizeof(float) * (1 * cols)); - } }; } // namespace integer From 3cf85f703fa3bb54787631c13c8ff99aeb4e6dc0 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 18:58:00 +0000 Subject: [PATCH 13/67] If SSE4.1 found use it to avoid perf regressions even if not -march=native --- CMakeLists.txt | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97ef84269..5fc7a6dee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,14 +215,13 @@ else(MSVC) endif() # Detect support CPU instrinsics for the current platform. This will - # only by used with BUILD_ARCH=native. For overridden BUILD_ARCH we - # minimally use -msse4.1. This seems to work with MKL. + # only by used with BUILD_ARCH=native. + include(FindSSE) set(INTRINSICS "") list(APPEND INTRINSICS_NVCC) if(BUILD_ARCH STREQUAL "native") message(STATUS "Checking support for CPU intrinsics") - include(FindSSE) if(SSE2_FOUND) message(STATUS "SSE2 support found") set(INTRINSICS "${INTRINSICS} -msse2") @@ -263,7 +262,12 @@ else(MSVC) # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case) set(INTRINSICS "-mssse3 -msimd128") else() - # Not assuming we have "-msse4.1 here" + # For overridden BUILD_ARCH we minimally use -msse4.1 (if available) + if(SSE4_1_FOUND) + message(STATUS "SSE4.1 support found") + set(INTRINSICS "${INTRINSICS} -msse4.1") + list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.1) + endif(SSE4_1_FOUND) endif() if(USE_FBGEMM) From 4edc8ef62dc4d580e8553af1fe1d110854d474d1 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 19:31:37 +0000 Subject: [PATCH 14/67] Deduplicate multiply by capturing variability through callbacks Bit ugly for now, but we are headed towards better. 
--- src/tensors/cpu/intgemm_interface.h | 8 +- src/tensors/cpu/ruy_adapter.h | 220 +++++++++++++--------------- 2 files changed, 104 insertions(+), 124 deletions(-) diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 57d01081b..0612559d9 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -461,13 +461,14 @@ float scalar_; intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); #else typedef typename intgemm_::type Integer; + auto callback = marian::cpu::integer::Preprocess::UnquantizeAndWrite(unquant_mult); intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ val_->data(), /*output*/ rows(child(0)->val()), cols(child(0)->val()), cols(child(1)->val()), - unquant_mult); + callback); #endif }}; @@ -554,14 +555,15 @@ class AffineNodeOp : public NaryNodeOp { } #else typedef typename intgemm_::type Integer; + auto callback = marian::cpu::integer::Preprocess::UnquantizeAndAddBiasAndWrite(unquant_mult, + child(2)->val()->data() /*child(2) is bias*/); intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ - child(2)->val()->data(), /*child(2) is bias*/ val_->data(), /*output*/ rows(child(0)->val()), cols(child(0)->val()), cols(child(1)->val()), - unquant_mult); + callback); #endif diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index a200c331b..e539218cd 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -72,32 +72,36 @@ struct Preprocess { } } - static void unquantizeAddBias(const int32_t *input, - const float *input_bias_prepared, - float unquant_multiplier, - Index rows_A, - Index cols_B, - float *output) { - for(Index i = 0; i < rows_A; i++) { - for(Index j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j]; + struct UnquantizeAndAddBiasAndWrite { + UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) + : unquant_multiplier(unquant_multiplier), input_bias_prepared(input_bias_prepared) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + for(Index i = 0; i < rows_A; i++) { + for(Index j = 0; j < cols_B; j++) { + Index idx = i * cols_B + j; + output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j]; + } } } - } - static void unquantize(const int32_t *input, - float unquant_multiplier, - Index rows_A, - Index cols_B, - float *output) { - for(Index i = 0; i < rows_A; i++) { - for(Index j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier); + float unquant_multiplier; + const float *input_bias_prepared; + }; + + struct UnquantizeAndWrite { + UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier(unquant_multiplier) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + for(Index i = 0; i < rows_A; i++) { + for(Index j = 0; j < cols_B; j++) { + Index idx = i * cols_B + j; + output[idx] = (input[idx] * unquant_multiplier); + } } } - } + float unquant_multiplier; + }; }; #if RUY_PLATFORM_NEON @@ -245,63 +249,68 @@ struct Preprocess { // clang-format on } - static void unquantizeAddBias(const int32_t *input, - const float *input_bias_prepared, - float unquant_multiplier, - Index rows_A, - Index cols_B, - float *output) { - // Set all registers in lane 
from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); - const int32x4_t *Input = reinterpret_cast(input); - const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); - float32x4_t *Output = reinterpret_cast(output); - - while(Input != InputEnd) { - // Bias cycles every column for addition. - const float32x4_t *Bias = reinterpret_cast(input_bias_prepared); - - // InputEnd needs to be determined to end the while loop below. - const int32x4_t *RowEnd - = reinterpret_cast(reinterpret_cast(Input) + cols_B); - - while(Input != RowEnd) { - // Operation happening for 4-elements together: - // output = [int32_t]input * [float]quant_mult + [float]bias; - float32x4_t floatInput = vcvtq_f32_s32(*Input++); - float32x4_t unquantized = vmulq_f32(floatInput, multiplier); - *Output++ = vaddq_f32(unquantized, *Bias++); + struct UnquantizeAndAddBiasAndWrite { + UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) + : unquant_multiplier(unquant_multiplier), input_bias_prepared(input_bias_prepared) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // Bias cycles every column for addition. + const float32x4_t *Bias = reinterpret_cast(input_bias_prepared); + + // InputEnd needs to be determined to end the while loop below. + const int32x4_t *RowEnd = reinterpret_cast( + reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = vaddq_f32(unquantized, *Bias++); + } } } - } - static void unquantize(const int32_t *input, - float unquant_multiplier, - Index rows_A, - Index cols_B, - float *output) { - // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); - const int32x4_t *Input = reinterpret_cast(input); - const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); - float32x4_t *Output = reinterpret_cast(output); + float unquant_multiplier; + const float *input_bias_prepared; + }; - while(Input != InputEnd) { - // Bias cycles every column for addition. - - // InputEnd needs to be determined to end the while loop below. - const int32x4_t *RowEnd - = reinterpret_cast(reinterpret_cast(Input) + cols_B); - - while(Input != RowEnd) { - // Operation happening for 4-elements together: - // output = [int32_t]input * [float]quant_mult + [float]bias; - float32x4_t floatInput = vcvtq_f32_s32(*Input++); - float32x4_t unquantized = vmulq_f32(floatInput, multiplier); - *Output++ = unquantized; + struct UnquantizeAndWrite { + UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier(unquant_multiplier) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + // Set all registers in lane from same scalar value. 
+ float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // Bias cycles every column for addition. + + // InputEnd needs to be determined to end the while loop below. + const int32x4_t *RowEnd = reinterpret_cast( + reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = unquantized; + } } } - } + + float unquant_multiplier; + }; }; #endif @@ -312,8 +321,8 @@ struct Preprocess { */ struct IntgemmViaRuy { // Convert compile time errors into run-time ABORTS. This allows bringing in only int8_t and - // select functions that are required to create a path which will run while not achieving parity - // with intgemm. + // select functions that are required to create a path which will run while not achieving + // parity with intgemm. template struct IntBase { using Type = T; @@ -340,12 +349,14 @@ struct IntgemmViaRuy { ABORT("SelectColumnsB Unsupported"); } - static void - Multiply(const Type *, const Type *, const float *, const float *, Index, Index, Index, float) { - ABORT("Multiply (A*B + bias) Unsupported"); - } - - static void Multiply(const Type *, const Type *, const float *, Index, Index, Index, float) { + template + static void Multiply(const Type *A_prepared, + const Type *B_prepared, + float *output, + Index rows_A, + Index width, + Index cols_B, + Callback callback) { ABORT("Multiply (A*B) Unsupported"); } }; @@ -394,14 +405,14 @@ struct IntgemmViaRuy { // specification and there are overloads with and without bias to avoid an if inside. This // method corresponds to the one with bias. // output = A*B + bias + template static void Multiply(const Type *input_A_prepared, const Type *input_B_prepared, - const float *bias_prepared, float *output, Index rows_A, Index width, Index cols_B, - float unquant_multiplier) { + Callback callback) { // It is expected that somehow we have managed to call all prepare by the time // we are here, with inputs (prepared) in int8_t. All that's left to do is use // ruy for multiply and then start with the reverse ops to get to fp32. @@ -435,48 +446,15 @@ struct IntgemmViaRuy { // Unquantizes, then adds bias in a single statement on the output. // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B); - Preprocess::unquantizeAddBias( - dest_ptr, bias_prepared, unquant_multiplier, rows_A, cols_B, output); - } - - // output = A*B (notice no bias). - static void Multiply(const Type *input_A_prepared, - const Type *input_B_prepared, - float *output, - Index rows_A, - Index width, - Index cols_B, - float unquant_multiplier) { - // It is expected that somehow we have managed to call all prepare by the time - // we are here, with inputs (prepared) in int8_t. All that's left to do is use - // ruy for multiply and then start with the reverse ops to get to fp32. - - // Use ruy to multiply. 
- // The following is adapted from - // https://github.com/google/ruy/blob/878283640de7946a43053e8ebf4f15114fbc9156/example/example.cc#L129-L152 - - ruy::Context context; - ruy::Matrix lhs; - ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout()); - lhs.set_data(input_A_prepared); - - ruy::Matrix rhs; - ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout()); - rhs.set_data(input_B_prepared); - - ruy::Matrix dst; - ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout()); - - std::int32_t *dest_ptr = reinterpret_cast(output); - dst.set_data(dest_ptr); - - // When Dst is int32, mul_params is unused. - ruy::MulParams mul_params; - ruy::Mul(lhs, rhs, mul_params, &context, &dst); + // Preprocess::unquantizeAddBias( + // dest_ptr, bias_prepared, unquant_multiplier, rows_A, cols_B, output); // Unquantizes, then adds bias in a single statement on the output. // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B); - Preprocess::unquantize(dest_ptr, unquant_multiplier, rows_A, cols_B, output); + // Preprocess::unquantize(dest_ptr, unquant_multiplier, rows_A, cols_B, + // output); + + callback(dest_ptr, rows_A, cols_B, output); } }; From a414b60dd3c8637e5d5dd28050978aa3016052db Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 19:35:45 +0000 Subject: [PATCH 15/67] Revert "If SSE4.1 found use it to avoid perf regressions even if not -march=native" This reverts commit 3cf85f703fa3bb54787631c13c8ff99aeb4e6dc0. --- CMakeLists.txt | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fc7a6dee..97ef84269 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,13 +215,14 @@ else(MSVC) endif() # Detect support CPU instrinsics for the current platform. This will - # only by used with BUILD_ARCH=native. - include(FindSSE) + # only by used with BUILD_ARCH=native. For overridden BUILD_ARCH we + # minimally use -msse4.1. This seems to work with MKL. set(INTRINSICS "") list(APPEND INTRINSICS_NVCC) if(BUILD_ARCH STREQUAL "native") message(STATUS "Checking support for CPU intrinsics") + include(FindSSE) if(SSE2_FOUND) message(STATUS "SSE2 support found") set(INTRINSICS "${INTRINSICS} -msse2") @@ -262,12 +263,7 @@ else(MSVC) # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case) set(INTRINSICS "-mssse3 -msimd128") else() - # For overridden BUILD_ARCH we minimally use -msse4.1 (if available) - if(SSE4_1_FOUND) - message(STATUS "SSE4.1 support found") - set(INTRINSICS "${INTRINSICS} -msse4.1") - list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.1) - endif(SSE4_1_FOUND) + # Not assuming we have "-msse4.1 here" endif() if(USE_FBGEMM) From e2069bfee29149ce38b06ba7b062daff454ac077 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 22:01:44 +0000 Subject: [PATCH 16/67] CMAKE_SYSTEM_PROCESSOR indicates x86 and native mode is not enabled, apply -msse4.1. 
--- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97ef84269..5ff0b5da8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,7 +263,9 @@ else(MSVC) # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case) set(INTRINSICS "-mssse3 -msimd128") else() - # Not assuming we have "-msse4.1 here" + if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64) + set(INTRINSICS "-msse4.1") + endif () endif() if(USE_FBGEMM) From 1b4049a4551746142d599c3e575a87e58bf24124 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 22:20:19 +0000 Subject: [PATCH 17/67] Remove comments, now that callback is working --- src/tensors/cpu/ruy_adapter.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index e539218cd..89e856d10 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -444,16 +444,6 @@ struct IntgemmViaRuy { ruy::MulParams mul_params; ruy::Mul(lhs, rhs, mul_params, &context, &dst); - // Unquantizes, then adds bias in a single statement on the output. - // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B); - // Preprocess::unquantizeAddBias( - // dest_ptr, bias_prepared, unquant_multiplier, rows_A, cols_B, output); - - // Unquantizes, then adds bias in a single statement on the output. - // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B); - // Preprocess::unquantize(dest_ptr, unquant_multiplier, rows_A, cols_B, - // output); - callback(dest_ptr, rows_A, cols_B, output); } }; From e522e6cdcbcef11f48f92aef906400cc94c930e2 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 19 Apr 2022 16:21:12 +0000 Subject: [PATCH 18/67] Minimal gemmRuy There are no accompanying tests. --- CMakeLists.txt | 5 +- src/tensors/cpu/prod_blas.h | 127 +++++++++++++++++++++++++++++++++++- 2 files changed, 130 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ff0b5da8..d5b61c9a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compil option(USE_SIMDE "Enable simde to target instruction sets" OFF) option(USE_ONNX_SGEMM "Compile with wasm compatible blas" OFF) +option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) # cmake options that are dependent on USE_WASM_COMPATIBLE_SOURCE cmake option @@ -573,7 +574,8 @@ if(COMPILE_CPU) endif(USE_INTGEMM) if(USE_RUY) - set(EXT_LIBS ${EXT_LIBS} ruy) + set(EXT_LIBS ${EXT_LIBS} ruy) + add_definitions(-DUSE_RUY_SGEMM=1) endif(USE_RUY) add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU @@ -615,6 +617,7 @@ if(COMPILE_CPU) endif(BLAS_FOUND) endif(MKL_FOUND) endif(USE_ONNX_SGEMM) + endif(COMPILE_CPU) ############################################################################### diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index c9dd6d7bc..a3d11db9c 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -5,8 +5,119 @@ #include #elif USE_ONNX_SGEMM #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" +#elif USE_RUY_SGEMM + #include "ruy/ruy.h" + #include "ruy/system_aligned_alloc.h" #endif +#if USE_RUY_SGEMM + +// AlignedVector allocates aligned memory and cleans up after itself. 
RAII
+// wrapper similar to intgemm's AlignedVector.
+template <class T>
+class AlignedVector {
+public:
+  AlignedVector(size_t num_elem)
+      : size_(num_elem),
+        storage_(reinterpret_cast<T *>(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {}
+
+  T *begin() { return storage_; }
+  T *data() { return storage_; }
+  size_t size() const { return size_; }
+  size_t memSize() const { return sizeof(T) * size_; }
+
+  // Forbid copy
+  AlignedVector(const AlignedVector &) = delete;
+  AlignedVector &operator=(const AlignedVector &) = delete;
+
+  ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast<void *>(storage_)); }
+
+private:
+  size_t size_;
+  T *storage_;
+};
+
+
+inline void GemmRuy(const bool transA,
+                    const bool transB,
+                    const int M,
+                    const int N,
+                    const int K,
+                    const float alpha,
+                    const float *A,
+                    const int lda,
+                    const float *B,
+                    const int ldb,
+                    const float beta,
+                    float *C,
+                    const int ldc) {
+  LOG(info, "Ruy multiplication called...");
+  ruy::Context context;
+
+  // If we need to transpose, we can swap dimensions in the layout and claim the matrix
+  // is just column-major. Set the ordering accordingly for the transpose.
+  const auto orderA = (transA ? ruy::Order::kColMajor : ruy::Order::kRowMajor);
+  const auto orderB = (transB ? ruy::Order::kColMajor : ruy::Order::kRowMajor);
+
+  ruy::Matrix<float> lhs;
+  ruy::MakeSimpleLayout(M, K, orderA, lhs.mutable_layout());
+  lhs.set_data(A);
+
+  ruy::Matrix<float> rhs;
+  ruy::MakeSimpleLayout(K, N, orderB, rhs.mutable_layout());
+  rhs.set_data(B);
+
+  ruy::Matrix<float> dst;
+  ruy::MakeSimpleLayout(M, N, ruy::Order::kRowMajor, dst.mutable_layout());
+
+  if(beta == 0) {
+    // For beta = 0, we want to avoid the additional allocation. This covers a
+    // large amount of our inference use-cases. sgemm is called with `beta` for
+    // accumulating gradients in backpropagation, which is 0.0 during
+    // inference.
+
+    dst.set_data(C);
+    ruy::MulParams<float, float> mul_params;
+    ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+    // Write out C as C = alpha * [op(A) * op(B)] + beta * C
+    // Can we expect the compiler to autovectorize this?
+    // TODO: Come back and explicitly use SIMD.
+    const size_t size = M * N;
+    const float *opA_opB = C;  // Alias.
+#pragma clang loop vectorize(enable) interleave(enable)
+    for(size_t i = 0; i < size; i++) {
+      C[i] = alpha * opA_opB[i];
+    }
+  } else {
+    // @jerinphilip has not yet been able to find a ruy primitive that does in
+    // place addition to obtain full gemm.
+    //
+    // Safe bet is to make an additional allocation to store the result of
+    // multiply and use the existing values in C.
+    //
+    // See also: https://github.com/google/ruy/issues/307
+
+    AlignedVector<float> intermediate(M * N);
+    dst.set_data(intermediate.data());
+    ruy::MulParams<float, float> mul_params;
+    ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+    // Write out C as C = alpha * [op(A) * op(B)] + beta * C
+    // Can we expect the compiler to autovectorize this?
+    // TODO: Come back and explicitly use SIMD. 
+ const size_t size = M * N; + const float *opA_opB = intermediate.data(); +#pragma clang loop vectorize(enable) interleave(enable) + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i] + beta * C[i]; + } + } +} + +#endif // RUY_SGEMM + + inline void sgemm(bool transA, bool transB, int rows_a, @@ -37,7 +148,21 @@ inline void sgemm(bool transA, ldc); #elif USE_ONNX_SGEMM gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c); -#else +#elif USE_RUY_SGEMM + GemmRuy(transA, + transB, + rows_a, + rows_b, + width, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc); +#else transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy ABORT("Marian must be compiled with a BLAS library"); #endif From 90858a5ddb0dc689da920d19cccda214e049f38c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 19 Apr 2022 16:22:47 +0000 Subject: [PATCH 19/67] Update CI --- .github/workflows/arm.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index e10d29d6f..9cb865895 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -90,7 +90,6 @@ jobs: -DUSE_INTGEMM=off -DUSE_SIMDE=on -DUSE_RUY=on - -DUSE_ONNX_SGEMM=on # For time being. # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. ) # Additionally list variables finally configured. From 557de0ceeae1125441437a1f71ccf66e089fc5aa Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 25 Apr 2022 14:57:27 +0100 Subject: [PATCH 20/67] Using simd_utils instead of SIMDE * Remove SIMDE dependency from integer_common.cpp:AddBias * Typo: __ARM_NEON__ * Revert "Typo: __ARM_NEON__" This reverts commit f29a0bb24e5a0b21d7bfd2f694bc24e37e87dd72. * Typo __ARM_NEON__: NoAutoFormatBuffer * AVX and optional expansion, include guarded by __AVX__ instead of SIMDE * Create an ARM_NEON structure to advance compilation without SIMDE * Import neon header files * No SIMDE for sse_mathfun * Import submodule as a whole, can't do only neon * Removing old files * Update simdutils submodule usage * Remove simde submodule * More SIMDE removal * Remove the neon_mathfun.h file * USE_SIMD_UTILS instead of USE_SIMDE in CI * xmmintrin.h include if SSE for WebAssembly * Include simd_utils only if flag set * Create a dummy float32x4 because Windows * #else block is SSE __m128d now, old behaviour * Windows does not give us flags, expects sse_mathfun * Point simd_utils to a fork with an experimental patch * Restore TODO, it's valid after the reset --- .github/workflows/arm.yml | 2 +- .gitmodules | 6 ++--- CMakeLists.txt | 18 +++++-------- src/3rd_party/CMakeLists.txt | 4 --- src/3rd_party/simd_utils | 1 + src/3rd_party/simde-no-tests | 1 - src/3rd_party/sse_mathfun.h | 6 ----- src/common/types.h | 42 +++++++++++++++++++++++++++--- src/functional/operators.h | 6 +++++ src/tensors/cpu/integer_common.cpp | 30 +++++++++++++++------ 10 files changed, 78 insertions(+), 38 deletions(-) create mode 160000 src/3rd_party/simd_utils delete mode 160000 src/3rd_party/simde-no-tests diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index 9cb865895..1b8ff0839 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -88,7 +88,7 @@ jobs: -DTHREADS_PREFER_PTHREAD_FLAG=ON -DBUILD_ARCH=armv8-a -DUSE_INTGEMM=off - -DUSE_SIMDE=on + -DUSE_SIMD_UTILS=on -DUSE_RUY=on # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. 
) diff --git a/.gitmodules b/.gitmodules index ae76037f1..536fa3829 100644 --- a/.gitmodules +++ b/.gitmodules @@ -23,9 +23,9 @@ [submodule "src/3rd_party/onnxjs"] path = src/3rd_party/onnxjs url = https://github.com/abhi-agg/onnxjs.git -[submodule "src/3rd_party/simde-no-tests"] - path = src/3rd_party/simde-no-tests - url = https://github.com/simd-everywhere/simde-no-tests/ [submodule "src/3rd_party/ruy"] path = src/3rd_party/ruy url = https://github.com/google/ruy +[submodule "src/3rd_party/simd_utils"] + path = src/3rd_party/simd_utils + url = https://github.com/JishinMaster/simd_utils/ diff --git a/CMakeLists.txt b/CMakeLists.txt index d5b61c9a8..19d74d770 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,7 +41,7 @@ option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (require option(COMPILE_WASM "Compile (wasm compatible) marian for WASM target" OFF) option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compile to wasm. Useful for debugging wasm failures by building same sources natively" OFF) -option(USE_SIMDE "Enable simde to target instruction sets" OFF) +option(USE_SIMD_UTILS "Enable simde to target instruction sets" OFF) option(USE_ONNX_SGEMM "Compile with wasm compatible blas" OFF) option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) @@ -55,8 +55,8 @@ if (USE_WASM_COMPATIBLE_SOURCE) add_compile_definitions(WASM_COMPATIBLE_SOURCE) # Setting USE_SSE2 definition to enable SSE2 specific code in "3rd_party/sse_mathfun.h" for wasm builds add_compile_definitions(USE_SSE2) - set(USE_ONNX_SGEMM ON CACHE BOOL "") - set(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "") + set(USE_ONNX_SGEMM ON CACHE BOOL "Use ONNX SGEMM (for WebAssembly)") + set(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without Exceptions") endif() if (COMPILE_WASM) @@ -203,11 +203,6 @@ if(MSVC) add_definitions(-DUSE_INTGEMM=1) endif(USE_INTGEMM) - if(USE_SIMDE) - add_definitions(-DUSE_SIMDE=1) - add_definitions(-DSIMDE_ENABLE_NATIVE_ALIASES=1) - endif(USE_SIMDE) - else(MSVC) # Check we are using at least g++ 5.0 @@ -278,10 +273,9 @@ else(MSVC) add_definitions(-DUSE_INTGEMM=1) endif(USE_INTGEMM) - if(USE_SIMDE) - add_definitions(-DUSE_SIMDE=1) - add_definitions(-DSIMDE_ENABLE_NATIVE_ALIASES=1) - endif(USE_SIMDE) + if(USE_SIMD_UTILS) + add_definitions(-DARM -DSSE -flax-vector-conversions) #added for ARM + endif(USE_SIMD_UTILS) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) # Clang-10.0.0 complains when CUDA is newer than 10.1 diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 2fbe69b14..f2062c381 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -27,10 +27,6 @@ if(USE_RUY) add_subdirectory(ruy EXCLUDE_FROM_ALL) endif(USE_RUY) -if(USE_SIMDE) - include_directories(./simde-no-tests) -endif(USE_SIMDE) - if(USE_ONNX_SGEMM) add_subdirectory(./onnxjs) endif(USE_ONNX_SGEMM) diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils new file mode 160000 index 000000000..696036258 --- /dev/null +++ b/src/3rd_party/simd_utils @@ -0,0 +1 @@ +Subproject commit 6960362584481c977cdae9f6a8f7061a37c766cb diff --git a/src/3rd_party/simde-no-tests b/src/3rd_party/simde-no-tests deleted file mode 160000 index 9af03cd0f..000000000 --- a/src/3rd_party/simde-no-tests +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9af03cd0f30efae1beb94ef31430dc0370b98b0c diff --git a/src/3rd_party/sse_mathfun.h b/src/3rd_party/sse_mathfun.h 
index 89ca1d3ed..bb0be6c65 100644
--- a/src/3rd_party/sse_mathfun.h
+++ b/src/3rd_party/sse_mathfun.h
@@ -29,13 +29,7 @@
   (this is the zlib license)
 */
 
-#ifndef USE_SIMDE
 #include <xmmintrin.h>
-#else
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#include "x86/sse.h"
-#endif
 
 /* yes I know, the top of this file is quite ugly */
 
diff --git a/src/common/types.h b/src/common/types.h
index fb6aab27a..621551daf 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -18,10 +18,14 @@
 
 #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
-#ifndef USE_SIMDE
+#ifdef __AVX__
 #include <immintrin.h>
-#else
-#include "x86/avx2.h"
+#elif __SSE__
+#include <xmmintrin.h>
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
 #endif
 
 #endif
@@ -167,6 +171,36 @@ struct intgemm8 {
 
 #ifndef __CUDACC__ // vectorized types not available from .cu files
 
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+struct float32x4 {
+private:
+  using __m128 = float32x4_t;
+  __m128 f_;
+
+public:
+  float32x4() {}
+  float32x4(const __m128& f) : f_(f) {}
+  float32x4(const float& f) : f_(vdupq_n_f32(f)) {} // __m128 _mm_set1_ps(float) copies value into all slots
+
+  operator const __m128&() const { return f_; }
+  operator __m128&() { return f_; }
+
+  float operator[] (size_t i) const {
+    return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, float32x4 f4) {
+    float* a = (float*)&f4;
+    out << "[" << a[0];
+    for(int i = 1; i < 4; i++)
+      out << " " << a[i];
+    out << "]";
+    return out;
+  }
+};
+
+#else
 // @TODO: check what intrinsics are actually available.
 struct float32x4 {
 private:
@@ -194,6 +228,8 @@ struct float32x4 {
   }
 };
 
+#endif
+
 // @TODO: consider how code can be shared via templating
 #ifdef __AVX__
 struct float32x8 {
diff --git a/src/functional/operators.h b/src/functional/operators.h
index d79ac3c05..00cb9c819 100755
--- a/src/functional/operators.h
+++ b/src/functional/operators.h
@@ -213,7 +213,13 @@ struct Ops {
 
 // __CUDA_ARCH__ is defined when compiling device (GPU) code
 #ifndef __CUDACC__
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include "3rd_party/simd_utils/simd_utils.h"
+#include "3rd_party/simd_utils/neon_mathfun.h"
+#else
 #include "3rd_party/sse_mathfun.h"
+#endif
+
 namespace marian {
 namespace functional {
diff --git a/src/tensors/cpu/integer_common.cpp b/src/tensors/cpu/integer_common.cpp
index a941c0d02..edaeb112c 100644
--- a/src/tensors/cpu/integer_common.cpp
+++ b/src/tensors/cpu/integer_common.cpp
@@ -1,16 +1,12 @@
 #include "integer_common.h"
 
-#ifndef USE_SIMDE
+#ifdef __SSE__
 #include <emmintrin.h>
 #include <immintrin.h>
 #include <tmmintrin.h>
 #include <xmmintrin.h>
-#else // USE_SIMDE
-// https://wiki.debian.org/SIMDEverywhere#Approach
-#include "x86/sse2.h"
-#include "x86/avx2.h"
-#include "x86/ssse3.h"
-#include "x86/sse.h"
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
 #endif
 
 namespace marian {
@@ -27,7 +23,9 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) {
 
   for(int j = 0; j < m; ++j) {
     int i = 0;
+
 #ifdef __AVX512F__
+    // Multiples of 16 add together.
    int n16 = n & ~15;
     for(; i < n16; i += 16) {
       __m512 ai = _mm512_loadu_ps(x + j * n + i);
@@ -35,7 +33,8 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) {
       __m512 yi = _mm512_add_ps(ai, bi);
       _mm512_storeu_ps(y + j * n + i, yi);
     }
-#else
+#elif __SSE__
+    // Multiples of 4 add together. 
int n4 = (n / 4) * 4; for(; i < n4; i += 4) { __m128 ai = _mm_loadu_ps(x + j * n + i); @@ -43,6 +42,21 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { __m128 yi = _mm_add_ps(ai, bi); _mm_storeu_ps(y + j * n + i, yi); } +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) + int n4 = (n / 4) * 4; + using __m128 = float32x4_t; + for(; i < n4; i += 4) { + __m128 ai = vld1q_f32(x + j * n + i); + __m128 bi = vld1q_f32(bias + i); + __m128 yi = vaddq_f32(ai, bi); + vst1q_f32(y + j * n + i, yi); + } + +#else + // StandardCPP No SIMD case. + for(i = 0; i < n; i++) { + y[j * n + i] = x[j * n + i] + bias[i]; + } #endif for(; i < n; i++) { y[j * n + i] = x[j * n + i] + bias[i]; From d10009fdb9a2be0449a4c232fadfc16276744bab Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:14:18 +0000 Subject: [PATCH 21/67] Style fixes: UnquantizeAndWrite, UnquantizeAddBiasAndWrite - Underscore suffix for curried args. - Make args private. --- src/tensors/cpu/ruy_adapter.h | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 89e856d10..f8117ab5d 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -74,33 +74,36 @@ struct Preprocess { struct UnquantizeAndAddBiasAndWrite { UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) - : unquant_multiplier(unquant_multiplier), input_bias_prepared(input_bias_prepared) {} + : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { for(Index i = 0; i < rows_A; i++) { for(Index j = 0; j < cols_B; j++) { Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j]; + output[idx] = (input[idx] * unquant_multiplier_) + input_bias_prepared_[j]; } } } - float unquant_multiplier; - const float *input_bias_prepared; + private: + float unquant_multiplier_; + const float *input_bias_prepared_; }; struct UnquantizeAndWrite { - UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier(unquant_multiplier) {} + UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { for(Index i = 0; i < rows_A; i++) { for(Index j = 0; j < cols_B; j++) { Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier); + output[idx] = (input[idx] * unquant_multiplier_); } } } - float unquant_multiplier; + + private: + float unquant_multiplier_; }; }; @@ -251,18 +254,18 @@ struct Preprocess { struct UnquantizeAndAddBiasAndWrite { UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) - : unquant_multiplier(unquant_multiplier), input_bias_prepared(input_bias_prepared) {} + : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); const int32x4_t *Input = reinterpret_cast(input); const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); float32x4_t *Output = reinterpret_cast(output); while(Input != InputEnd) { // Bias cycles every column for addition. 
- const float32x4_t *Bias = reinterpret_cast(input_bias_prepared); + const float32x4_t *Bias = reinterpret_cast(input_bias_prepared_); // InputEnd needs to be determined to end the while loop below. const int32x4_t *RowEnd = reinterpret_cast( @@ -278,16 +281,17 @@ struct Preprocess { } } - float unquant_multiplier; - const float *input_bias_prepared; + private: + float unquant_multiplier_; + const float *input_bias_prepared_; }; struct UnquantizeAndWrite { - UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier(unquant_multiplier) {} + UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); const int32x4_t *Input = reinterpret_cast(input); const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); float32x4_t *Output = reinterpret_cast(output); @@ -309,7 +313,8 @@ struct Preprocess { } } - float unquant_multiplier; + private: + float unquant_multiplier_; }; }; From 3a37966868263af54a43e2bb3b0cf70d01ae5a66 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:24:14 +0000 Subject: [PATCH 22/67] const for () operator overrides --- src/tensors/cpu/ruy_adapter.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index f8117ab5d..6015612d7 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -76,7 +76,7 @@ struct Preprocess { UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { for(Index i = 0; i < rows_A; i++) { for(Index j = 0; j < cols_B; j++) { Index idx = i * cols_B + j; @@ -93,7 +93,7 @@ struct Preprocess { struct UnquantizeAndWrite { UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { for(Index i = 0; i < rows_A; i++) { for(Index j = 0; j < cols_B; j++) { Index idx = i * cols_B + j; @@ -256,7 +256,7 @@ struct Preprocess { UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { // Set all registers in lane from same scalar value. float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); const int32x4_t *Input = reinterpret_cast(input); @@ -289,7 +289,7 @@ struct Preprocess { struct UnquantizeAndWrite { UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { // Set all registers in lane from same scalar value. 
float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); const int32x4_t *Input = reinterpret_cast(input); From 418a7ce143621b56636cbd6062d13b509feba280 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:42:51 +0000 Subject: [PATCH 23/67] Explicit for single argument constructor: UnquantizeAndWrite --- src/tensors/cpu/ruy_adapter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 6015612d7..a9d51e43a 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -91,7 +91,8 @@ struct Preprocess { }; struct UnquantizeAndWrite { - UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} + explicit UnquantizeAndWrite(float unquant_multiplier) + : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { for(Index i = 0; i < rows_A; i++) { @@ -287,7 +288,8 @@ struct Preprocess { }; struct UnquantizeAndWrite { - UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} + explicit tUnquantizeAndWrite(float unquant_multiplier) + : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { // Set all registers in lane from same scalar value. From b7412c3b5fe677d31c61ab1718187178635dc71d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:49:07 +0000 Subject: [PATCH 24/67] Fix typo --- src/tensors/cpu/ruy_adapter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index a9d51e43a..09fd7b427 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -288,7 +288,7 @@ struct Preprocess { }; struct UnquantizeAndWrite { - explicit tUnquantizeAndWrite(float unquant_multiplier) + explicit UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { From 071e0d4be4edaaad12eb82a6bef75bcd81318f99 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:50:44 +0000 Subject: [PATCH 25/67] Low compute path for special case alpha = 1.0 --- src/tensors/cpu/prod_blas.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index a3d11db9c..5415420f9 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -80,15 +80,18 @@ inline void GemmRuy(const bool transA, ruy::MulParams mul_params; ruy::Mul(lhs, rhs, mul_params, &context, &dst); - // Write out C as C = alpha * [op(A) * op(B)] + beta * C - // Can we expect the compiler to autovectorize this? - // TODO: Come back and explicitly use SIMD. - const size_t size = M * N; - const float *opA_opB = C; // Alias. + if(alpha != 1.0) { + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = C; // Alias. #pragma clang loop vectorize(enable) interleave(enable) - for(size_t i = 0; i < size; i++) { - C[i] = alpha * opA_opB[i]; + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i]; + } } + } else { // @jerinphilip has not yet been able to find a ruy primitive that does in // place addition to obtain full gemm. 
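PATCHES 14 and 21 through 24 above converge on a callback design: the epilogue (unquantize, optionally adding bias) is captured in a small functor that Multiply invokes on the raw int32 GEMM result, so a single Multiply overload serves both the bias and no-bias paths. A minimal sketch of that shape, outside the patch and with illustrative names only:

    #include <cstdint>

    using Index = unsigned int;

    // Epilogue functor: unquantizes the int32 GEMM result into the fp32 output.
    struct UnquantizeAndWrite {
      explicit UnquantizeAndWrite(float unquant_multiplier)
          : unquant_multiplier_(unquant_multiplier) {}
      void operator()(const std::int32_t *input, Index rows_A, Index cols_B, float *output) const {
        for(Index i = 0; i < rows_A * cols_B; ++i)
          output[i] = input[i] * unquant_multiplier_;
      }

    private:
      float unquant_multiplier_;
    };

    // A Multiply-shaped function stays agnostic of the epilogue: whatever
    // callable the caller hands in consumes the int32 result.
    template <class Callback>
    void finishMultiply(const std::int32_t *gemm_result,
                        Index rows_A,
                        Index cols_B,
                        float *output,
                        Callback callback) {
      callback(gemm_result, rows_A, cols_B, output);
    }

Because the callback carries its own curried state (multiplier, bias pointer), the if-on-bias branch disappears from the hot path, mirroring intgemm's callback style that the commit message alludes to.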
From ec886bdead5f224d79be62e8e0051a2bd9be1036 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 30 May 2022 10:04:47 +0000 Subject: [PATCH 26/67] Remove clang only pragmas --- src/tensors/cpu/prod_blas.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index 5415420f9..1a1ae4b35 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -86,7 +86,6 @@ inline void GemmRuy(const bool transA, // TODO: Come back and explicitly use SIMD. const size_t size = M * N; const float *opA_opB = C; // Alias. -#pragma clang loop vectorize(enable) interleave(enable) for(size_t i = 0; i < size; i++) { C[i] = alpha * opA_opB[i]; } @@ -111,7 +110,6 @@ inline void GemmRuy(const bool transA, // TODO: Come back and explicitly use SIMD. const size_t size = M * N; const float *opA_opB = intermediate.data(); -#pragma clang loop vectorize(enable) interleave(enable) for(size_t i = 0; i < size; i++) { C[i] = alpha * opA_opB[i] + beta * C[i]; } From 4df1998ef60d54ce53e38865de4ca20f9c63d51d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 30 May 2022 10:24:55 +0000 Subject: [PATCH 27/67] Remove leftover bias cycles comment --- src/tensors/cpu/ruy_adapter.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 09fd7b427..6be3bbd59 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -299,8 +299,6 @@ struct Preprocess { float32x4_t *Output = reinterpret_cast(output); while(Input != InputEnd) { - // Bias cycles every column for addition. - // InputEnd needs to be determined to end the while loop below. const int32x4_t *RowEnd = reinterpret_cast( reinterpret_cast(Input) + cols_B); From c4be980fca1f1031198b8c6a1aa241b86bd7bbaf Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 2 Jun 2022 14:14:58 +0000 Subject: [PATCH 28/67] Defaults: intgemm for x86_64 and ruy and simd_utils for arm --- CMakeLists.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb5ec3bdf..f19481700 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,8 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) -option(USE_INTGEMM "Use INTGEMM" ON) +option(USE_INTGEMM "Use INTGEMM" OFF) +option(USE_RUY "Use Ruy" OFF) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) @@ -50,6 +51,15 @@ option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) + +if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64) + set(USE_INTGEMM ON) +else() + set(USE_RUY ON) + set(USE_RUY_SGEMM ON) + set(USE_SIMD_UTILS ON) +endif() + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) From b181847ae119658921798a3ca3f9cb0a803d63b1 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 6 Jun 2022 12:55:59 +0000 Subject: [PATCH 29/67] Revert "Defaults: intgemm for x86_64 and ruy and simd_utils for arm" This reverts commit c4be980fca1f1031198b8c6a1aa241b86bd7bbaf. 
--- CMakeLists.txt | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f19481700..cb5ec3bdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,8 +31,7 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) -option(USE_INTGEMM "Use INTGEMM" OFF) -option(USE_RUY "Use Ruy" OFF) +option(USE_INTGEMM "Use INTGEMM" ON) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) @@ -51,15 +50,6 @@ option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) - -if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64) - set(USE_INTGEMM ON) -else() - set(USE_RUY ON) - set(USE_RUY_SGEMM ON) - set(USE_SIMD_UTILS ON) -endif() - if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) From 6e4c5610e8ffe5d5cddf28f8dfd71981d22d2387 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 7 Jun 2022 13:04:44 +0100 Subject: [PATCH 30/67] Target architecture detection for ARM --- .github/workflows/arm.yml | 3 - CMakeLists.txt | 24 ++++++- cmake/TargetArch.cmake | 142 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 cmake/TargetArch.cmake diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index 1b8ff0839..9667ece05 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -87,9 +87,6 @@ jobs: -DCMAKE_USE_PTHREADS_INIT=1 -DTHREADS_PREFER_PTHREAD_FLAG=ON -DBUILD_ARCH=armv8-a - -DUSE_INTGEMM=off - -DUSE_SIMD_UTILS=on - -DUSE_RUY=on # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. ) # Additionally list variables finally configured. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index cb5ec3bdf..d20cc9e39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,19 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) include(CMakeDependentOption) +# Architecture detection +include(TargetArch) + +target_architecture(CMAKE_TARGET_ARCHITECTURES) +list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len) +if(NOT "${cmake_target_arch_len}" STREQUAL "1") + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "universal") +else() + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}") +endif() + # Custom CMake options option(COMPILE_CPU "Compile CPU version" ON) option(COMPILE_CUDA "Compile GPU version" ON) @@ -31,7 +44,8 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) -option(USE_INTGEMM "Use INTGEMM" ON) +option(USE_INTGEMM "Use INTGEMM" OFF) +option(USE_RUY "Use Ruy" OFF) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) @@ -50,6 +64,14 @@ option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) +if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + set(USE_RUY ON) + set(USE_RUY_SGEMM ON) + set(USE_SIMD_UTILS ON) +else() + set(USE_INTGEMM ON) +endif() + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake new file mode 100644 index 000000000..01f4a44d7 --- /dev/null +++ b/cmake/TargetArch.cmake @@ -0,0 +1,142 @@ +# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake +# Based on the Qt 5 processor detection code, so should be very accurate +# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h +# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64) + +# Regarding POWER/PowerPC, just as is noted in the Qt source, +# "There are many more known variants/revisions that we do not handle/detect." 
+ +set(archdetect_c_code " +#if defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__) + #if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8) \\ + || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) \\ + || defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R) \\ + || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8) + #error cmake_ARCH armv8 + #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7) \\ + || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7A__) \\ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7R__) \\ + || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7) + #error cmake_ARCH armv7 + #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6) \\ + || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6J__) \\ + || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6T2__) \\ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6Z__) \\ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6K__) \\ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6ZK__) \\ + || defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6) + #error cmake_ARCH armv6 + #elif defined(__ARM_ARCH_5TEJ__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5) + #error cmake_ARCH armv5 + #else + #error cmake_ARCH arm + #endif +#elif defined(__i386) || defined(__i386__) || defined(_M_IX86) + #error cmake_ARCH i386 +#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #error cmake_ARCH x86_64 +#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + #error cmake_ARCH ia64 +#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\ + || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\ + || defined(_M_MPPC) || defined(_M_PPC) + #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__) + #error cmake_ARCH ppc64 + #else + #error cmake_ARCH ppc + #endif +#endif + +#error cmake_ARCH unknown +") + + +# Set ppc_support to TRUE before including this file or ppc and ppc64 +# will be treated as invalid architectures since they are no longer supported by Apple + +function(target_architecture output_var) + if(APPLE AND CMAKE_OSX_ARCHITECTURES) + # On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set + # First let's normalize the order of the values + + # Note that it's not possible to compile PowerPC applications if you are using + # the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we + # disable it by default + # See this page for more information: + # http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4 + + # Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime. + # On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise. 
+ + foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES}) + if("${osx_arch}" STREQUAL "ppc" AND ppc_support) + set(osx_arch_ppc TRUE) + elseif("${osx_arch}" STREQUAL "i386") + set(osx_arch_i386 TRUE) + elseif("${osx_arch}" STREQUAL "x86_64") + set(osx_arch_x86_64 TRUE) + elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support) + set(osx_arch_ppc64 TRUE) + else() + message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}") + endif() + endforeach() + + # Now add all the architectures in our normalized order + if(osx_arch_ppc) + list(APPEND ARCH ppc) + endif() + + if(osx_arch_i386) + list(APPEND ARCH i386) + endif() + + if(osx_arch_x86_64) + list(APPEND ARCH x86_64) + endif() + + if(osx_arch_ppc64) + list(APPEND ARCH ppc64) + endif() + else() + file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}") + + enable_language(C) + + # Detect the architecture in a rather creative way... + # This compiles a small C program which is a series of ifdefs that selects a + # particular #error preprocessor directive whose message string contains the + # target architecture. The program will always fail to compile (both because + # file is not a valid C program, and obviously because of the presence of the + # #error preprocessor directives... but by exploiting the preprocessor in this + # way, we can detect the correct target architecture even when cross-compiling, + # since the program itself never needs to be run (only the compiler/preprocessor) + try_run( + run_result_unused + compile_result_unused + "${CMAKE_BINARY_DIR}" + "${CMAKE_BINARY_DIR}/arch.c" + COMPILE_OUTPUT_VARIABLE ARCH + CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} + ) + + # Parse the architecture name from the compiler output + string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}") + + # Get rid of the value marker leaving just the architecture name + string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}") + + # If we are compiling with an unknown architecture this variable should + # already be set to "unknown" but in the case that it's empty (i.e. 
due + # to a typo in the code), then set it to unknown + if (NOT ARCH) + set(ARCH unknown) + endif() + endif() + + set(${output_var} "${ARCH}" PARENT_SCOPE) +endfunction() From 53636cf42a0e80a1d2aa0c206d853e7451d2663d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 7 Jun 2022 13:57:22 +0000 Subject: [PATCH 31/67] Remove DEBUG statements --- src/tensors/cpu/ruy_adapter.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 6be3bbd59..f080d929c 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -431,14 +431,10 @@ struct IntgemmViaRuy { ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout()); lhs.set_data(input_A_prepared); - // PRINT_MATRIX_DEBUG(input_A_prepared, rows_A, width, Order::RowMajor); - ruy::Matrix rhs; ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout()); rhs.set_data(input_B_prepared); - // PRINT_MATRIX_DEBUG(input_B_prepared, width, cols_B, Order::ColMajor); - ruy::Matrix dst; ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout()); From be9e153c5664c1e813c0689831975edb607cee54 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 8 Jun 2022 15:11:46 +0000 Subject: [PATCH 32/67] Remove IntBase inheritance; PrepareB still unimplemented --- src/tensors/cpu/ruy_adapter.h | 78 +++++++++++++++++------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index f080d929c..3bf51bfff 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -328,46 +328,9 @@ struct IntgemmViaRuy { // Convert compile time errors into run-time ABORTS. This allows bringing in only int8_t and // select functions that are required to create a path which will run while not achieving // parity with intgemm. - template - struct IntBase { - using Type = T; - static void Quantize(const float *, Type *, float, Index) { ABORT("Quantize unsupported"); } - - static void PrepareA(const float *input, - Type *output, - float quant_mult, - Index rows, - Index cols) { - ABORT("PrepareA Unsupported"); - } - - static void PrepareB(const float *, Type *, float, Index, Index) { - ABORT("PrepareB Unsupported"); - } - static void PrepareBQuantizedTransposed(const Type *, Type *, Index, Index) { - ABORT("PrepareBQuantizedTransposed Unsupported"); - } - static void PrepareBTransposed(const float *, Type *, float, Index, Index) { - ABORT("PrepareBTransposed Unsupported"); - } - static void SelectColumnsB(const Type *, Type *, Index, const Index *, const Index *) { - ABORT("SelectColumnsB Unsupported"); - } - - template - static void Multiply(const Type *A_prepared, - const Type *B_prepared, - float *output, - Index rows_A, - Index width, - Index cols_B, - Callback callback) { - ABORT("Multiply (A*B) Unsupported"); - } - }; // Intgemm nomenclature expects Int8. Missing functions are ABORTs. - struct Int8 : IntBase { + struct Int8 { using Type = int8_t; static void PrepareBQuantizedTransposed(const Type *input, Type *output, @@ -384,6 +347,10 @@ struct IntgemmViaRuy { Preprocess::quantize(input, output, quant_mult, rows, cols); } + static void PrepareB(const float *, Type *, float, Index, Index) { + ABORT("PrepareB Unsupported"); + } + static void PrepareA(const float *input, int8_t *output, float quant_mult, @@ -450,8 +417,41 @@ struct IntgemmViaRuy { }; // Int16 support is currently missing. 
- struct Int16 : IntBase { + struct Int16 { using Type = int16_t; + static void Quantize(const float *, Type *, float, Index) { ABORT("Quantize unsupported"); } + + static void PrepareA(const float *input, + Type *output, + float quant_mult, + Index rows, + Index cols) { + ABORT("PrepareA Unsupported"); + } + + static void PrepareB(const float *, Type *, float, Index, Index) { + ABORT("PrepareB Unsupported"); + } + static void PrepareBQuantizedTransposed(const Type *, Type *, Index, Index) { + ABORT("PrepareBQuantizedTransposed Unsupported"); + } + static void PrepareBTransposed(const float *, Type *, float, Index, Index) { + ABORT("PrepareBTransposed Unsupported"); + } + static void SelectColumnsB(const Type *, Type *, Index, const Index *, const Index *) { + ABORT("SelectColumnsB Unsupported"); + } + + template + static void Multiply(const Type *A_prepared, + const Type *B_prepared, + float *output, + Index rows_A, + Index width, + Index cols_B, + Callback callback) { + ABORT("Multiply (A*B) Unsupported"); + } }; template From 876a91566093bc1d6795dbbbb90bf59694c73b41 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 8 Jun 2022 15:23:23 +0000 Subject: [PATCH 33/67] Remove obsolete comment --- src/tensors/cpu/ruy_adapter.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 3bf51bfff..2d19fc4a3 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -325,10 +325,6 @@ struct Preprocess { * intgemm_interface.h diff minimal. There are possibly better abstractions. */ struct IntgemmViaRuy { - // Convert compile time errors into run-time ABORTS. This allows bringing in only int8_t and - // select functions that are required to create a path which will run while not achieving - // parity with intgemm. - // Intgemm nomenclature expects Int8. Missing functions are ABORTs. 
struct Int8 { using Type = int8_t; From d399a35d9e2e82ff7ecf29a8f41c1ae9f2242cfa Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 8 Jun 2022 15:49:30 +0000 Subject: [PATCH 34/67] Remove logging statement in hotpath --- src/tensors/cpu/prod_blas.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index 1a1ae4b35..bab16cfbd 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -51,7 +51,6 @@ inline void GemmRuy(const bool transA, const float beta, float *C, const int ldc) { - LOG(info, "Ruy multiplication called..."); ruy::Context context; // If we need to transpose, we can swap dimensions in layout claim the matrix From 06b6dd96279c64f27b8650329a96f475c38e3ed8 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 11:42:58 +0000 Subject: [PATCH 35/67] =?UTF-8?q?Use=20CMAKE=5FCXX=5FFLAGS=20instead=20of?= =?UTF-8?q?=20add=5Fdefinitions=20=F0=9F=A4=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d20cc9e39..b167d0586 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,18 +286,6 @@ else(MSVC) endif () endif() - if(USE_FBGEMM) - set(EXT_LIBS ${EXT_LIBS} fbgemm dl) - add_definitions(-DUSE_FBGEMM=1) - endif(USE_FBGEMM) - - if(USE_INTGEMM) - add_definitions(-DUSE_INTGEMM=1) - endif(USE_INTGEMM) - - if(USE_SIMD_UTILS) - add_definitions(-DARM -DSSE -flax-vector-conversions) #added for ARM - endif(USE_SIMD_UTILS) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) # Clang-10.0.0 complains when CUDA is newer than 10.1 @@ -367,6 +355,20 @@ else(MSVC) endif(COMPILE_WASM) endif(MSVC) +if(USE_FBGEMM) + set(EXT_LIBS ${EXT_LIBS} fbgemm dl) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_FBGEMM=1") +endif(USE_FBGEMM) + +if(USE_INTGEMM) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTGEMM=1") +endif(USE_INTGEMM) + +if(USE_SIMD_UTILS) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DARM -DSSE -flax-vector-conversions") #added for ARM + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DARM -DSSE -flax-vector-conversions") #added for ARM +endif(USE_SIMD_UTILS) + # with gcc 7.0 and above we need to mark fallthrough in switch case statements # that can be done in comments for backcompat, but CCACHE removes comments. # -C makes gcc keep comments. From e310f73ddfc1c4114e52cd27dd228c848c9c358f Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 12:13:53 +0000 Subject: [PATCH 36/67] Check: Does add_compile_{defs,opts} propogate up? 
--- CMakeLists.txt | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b167d0586..7cc9c3061 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,24 @@ else() set(USE_INTGEMM ON) endif() +if(USE_FBGEMM) + set(EXT_LIBS ${EXT_LIBS} fbgemm dl) + add_compile_definitions(USE_FBGEMM=1) +endif(USE_FBGEMM) + +if(USE_INTGEMM) + add_compile_defintions(USE_INTGEMM=1) +endif(USE_INTGEMM) + +if(USE_SIMD_UTILS) + add_compile_definitions(ARM SSE) #added for ARM + if(MSVC) + add_compile_options(/flax-vector-conversions) + else(MSVC) + add_compile_options(-flax-vector-conversions) + endif(MSVC) +endif(USE_SIMD_UTILS) + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) @@ -355,19 +373,6 @@ else(MSVC) endif(COMPILE_WASM) endif(MSVC) -if(USE_FBGEMM) - set(EXT_LIBS ${EXT_LIBS} fbgemm dl) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_FBGEMM=1") -endif(USE_FBGEMM) - -if(USE_INTGEMM) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTGEMM=1") -endif(USE_INTGEMM) - -if(USE_SIMD_UTILS) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DARM -DSSE -flax-vector-conversions") #added for ARM - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DARM -DSSE -flax-vector-conversions") #added for ARM -endif(USE_SIMD_UTILS) # with gcc 7.0 and above we need to mark fallthrough in switch case statements # that can be done in comments for backcompat, but CCACHE removes comments. From 3bf113317e9a92ff86618f941ba5c0dc8b41cb29 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 12:16:19 +0000 Subject: [PATCH 37/67] Fix typo: definitions --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cc9c3061..c2ed92cb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ if(USE_FBGEMM) endif(USE_FBGEMM) if(USE_INTGEMM) - add_compile_defintions(USE_INTGEMM=1) + add_compile_definitions(USE_INTGEMM=1) endif(USE_INTGEMM) if(USE_SIMD_UTILS) From 5c8b1d20b730ef9a9adbb709c850f55f6b5f9940 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 12:40:06 +0000 Subject: [PATCH 38/67] Undo edit attempts manually for min-diff; Using compile_definitions now --- CMakeLists.txt | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c2ed92cb1..5b5a574ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,11 +72,6 @@ else() set(USE_INTGEMM ON) endif() -if(USE_FBGEMM) - set(EXT_LIBS ${EXT_LIBS} fbgemm dl) - add_compile_definitions(USE_FBGEMM=1) -endif(USE_FBGEMM) - if(USE_INTGEMM) add_compile_definitions(USE_INTGEMM=1) endif(USE_INTGEMM) @@ -211,7 +206,6 @@ if(MSVC) # set(INTRINSICS "/arch:AVX") add_definitions(-DUSE_SSE2=1) - # Or maybe use these? 
set(INTRINSICS ${MSVC_BUILD_ARCH}) # set(INTRINSICS "/arch:AVX512") @@ -238,13 +232,7 @@ if(MSVC) set(EXT_LIBS ${EXT_LIBS} fbgemm) add_definitions(-DUSE_FBGEMM=1 -DFBGEMM_STATIC=1) endif(USE_FBGEMM) - - if(USE_INTGEMM) - add_definitions(-DUSE_INTGEMM=1) - endif(USE_INTGEMM) - else(MSVC) - # Check we are using at least g++ 5.0 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) message(FATAL_ERROR "FATAL ERROR: Compiling Marian requires at least g++ 5.0, your version is ${CMAKE_CXX_COMPILER_VERSION}") @@ -304,6 +292,10 @@ else(MSVC) endif () endif() + if(USE_FBGEMM) + set(EXT_LIBS ${EXT_LIBS} fbgemm dl) + add_compile_definitions(USE_FBGEMM=1) + endif(USE_FBGEMM) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) # Clang-10.0.0 complains when CUDA is newer than 10.1 @@ -600,7 +592,7 @@ if(COMPILE_CPU) if(USE_RUY) set(EXT_LIBS ${EXT_LIBS} ruy) - add_definitions(-DUSE_RUY_SGEMM=1) + add_compile_definitions(USE_RUY_SGEMM=1) endif(USE_RUY) add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU From 9dd1eff086674952b89cf252107dce29cc282605 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 14:03:16 +0000 Subject: [PATCH 39/67] Restore CMakeDependentOption; Rename only to ONNX_SGEMM --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b5a574ea..7f35af61e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,10 @@ option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) # cmake options that are dependent on USE_WASM_COMPATIBLE_SOURCE cmake option CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) +CMAKE_DEPENDENT_OPTION(USE_ONNX_SGEMM "Compile with wasm compatible blas" ON + "USE_WASM_COMPATIBLE_SOURCE" OFF) +CMAKE_DEPENDENT_OPTION(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" ON + "USE_WASM_COMPATIBLE_SOURCE" OFF) if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") set(USE_RUY ON) @@ -85,13 +89,12 @@ if(USE_SIMD_UTILS) endif(MSVC) endif(USE_SIMD_UTILS) + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) # Setting USE_SSE2 definition to enable SSE2 specific code in "3rd_party/sse_mathfun.h" for wasm builds add_compile_definitions(USE_SSE2) - set(USE_ONNX_SGEMM ON CACHE BOOL "Use ONNX SGEMM (for WebAssembly)") - set(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without Exceptions") endif() if (COMPILE_WASM) From b055c11bfa71332d0a87ae5ae2b82430af4555c4 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 14:28:10 +0000 Subject: [PATCH 40/67] Backtrack attempt to flatten ONNX_SGEMM out --- CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f35af61e..886fb63a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -606,9 +606,7 @@ if(COMPILE_CPU) ## ^ SGEMM != BLAS set(EXT_LIBS ${EXT_LIBS} onnx-sgemm) add_definitions(-DUSE_ONNX_SGEMM=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock - endif(USE_ONNX_SGEMM) - - if(APPLE AND USE_APPLE_ACCELERATE) + elseif(APPLE AND USE_APPLE_ACCELERATE) set(BLAS_VENDOR "Accelerate") # see https://developer.apple.com/documentation/accelerate for 
more info # you may need to install Xcode command line tools if you don't have them already (https://developer.apple.com/xcode/features/) From d006196de2b22744c2a01a998b2576db496af28a Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 15:28:24 +0000 Subject: [PATCH 41/67] USE_ONNX_SGEMM is a CMakeDependentOption --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 886fb63a6..433258442 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,7 +56,6 @@ option(COMPILE_WASM "Compile (wasm compatible) marian for WASM target" OFF) option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compile to wasm. Useful for debugging wasm failures by building same sources natively" OFF) option(USE_SIMD_UTILS "Enable simde to target instruction sets" OFF) -option(USE_ONNX_SGEMM "Compile with wasm compatible blas" OFF) option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) From 39b72375877c0642994254dcc38fb2f94ba9f367 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Fri, 10 Jun 2022 16:50:56 +0000 Subject: [PATCH 42/67] Keep pre armv8 TargetArch detect unchanged --- cmake/TargetArch.cmake | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake index 01f4a44d7..f653e3e28 100644 --- a/cmake/TargetArch.cmake +++ b/cmake/TargetArch.cmake @@ -14,19 +14,19 @@ set(archdetect_c_code " || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\ || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8) #error cmake_ARCH armv8 - #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7) \\ - || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7A__) \\ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7R__) \\ - || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7M__) \\ + #elif defined(__ARM_ARCH_7__) \\ + || defined(__ARM_ARCH_7A__) \\ + || defined(__ARM_ARCH_7R__) \\ + || defined(__ARM_ARCH_7M__) \\ || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7) #error cmake_ARCH armv7 - #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6) \\ - || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6J__) \\ - || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6T2__) \\ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6Z__) \\ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6K__) \\ - || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6ZK__) \\ - || defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6M__) \\ + #elif defined(__ARM_ARCH_6__) \\ + || defined(__ARM_ARCH_6J__) \\ + || defined(__ARM_ARCH_6T2__) \\ + || defined(__ARM_ARCH_6Z__) \\ + || defined(__ARM_ARCH_6K__) \\ + || defined(__ARM_ARCH_6ZK__) \\ + || defined(__ARM_ARCH_6M__) \\ || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6) #error cmake_ARCH armv6 #elif defined(__ARM_ARCH_5TEJ__) \\ From 3a6c51516670eaca3e841ccb76840208091f3278 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 08:57:47 +0000 Subject: [PATCH 43/67] Simple ARM detection to no-op out shifted/shiftedAll paths --- src/tensors/cpu/backend.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 583a2f792..7804c5577 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -78,14 +78,25 @@ class Backend : public marian::Backend { void setInt8(bool optimize) override { int8_ = optimize; } bool isInt8() override { return int8_; } - void 
setShifted(bool shifted) override { shifted_ = shifted; } + void setShifted(bool shifted) override { +#if (defined(__arm__) || defined(__aarch64__)) + shifted_ = false; +#else + shifted_ = shifted; +#endif + } bool isShifted() override { return shifted_; } void setShiftedAll(bool shiftedAll) override { +#if (defined(__arm__) || defined(__aarch64__)) + shiftedAll_ = false; + shifted_ = false; +#else shiftedAll_ = shiftedAll; if (shiftedAll_) { shifted_ = true; } +#endif } bool isShiftedAll() override { From 46db01bf313a12e64b8cb370f4d4af2ade0269ff Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 09:20:50 +0000 Subject: [PATCH 44/67] Add logging statements to indicate forced gemm-path change at construction --- src/tensors/cpu/backend.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 7804c5577..a571cbc80 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -80,6 +80,7 @@ class Backend : public marian::Backend { void setShifted(bool shifted) override { #if (defined(__arm__) || defined(__aarch64__)) + LOG(info, "gemm-precision: *shifted* is not available on ARM; Setting to false."); shifted_ = false; #else shifted_ = shifted; @@ -89,6 +90,7 @@ class Backend : public marian::Backend { void setShiftedAll(bool shiftedAll) override { #if (defined(__arm__) || defined(__aarch64__)) + LOG(info, "gemm-precision: *shifted* is not available on ARM; Setting to false."); shiftedAll_ = false; shifted_ = false; #else From 63fea9a7d438917858995b17ddebaea64120d71a Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 09:20:59 +0000 Subject: [PATCH 45/67] Remove run script --- scripts/run.sh | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 scripts/run.sh diff --git a/scripts/run.sh b/scripts/run.sh deleted file mode 100644 index f080dee68..000000000 --- a/scripts/run.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -NDK=/mnt/Storage/jphilip/android-ndk-r23b -ABI=arm64-v8a -MINSDK_VERSION=28 -CUSTOM_MODULE_PATH=/mnt/Storage/jphilip/marian-android/openblas-install/lib/cmake/openblas -ANDROID_PLATFORM=28 - -OTHER_ANDROID_ARGS=( - -DANDROID_ARM_NEON=TRUE -) - -OTHER_MARIAN_ARGS=( - -DCOMPILE_CUDA=off - -DCOMPILE_CPU=on - -DCMAKE_HAVE_THREADS_LIBRARY=1 - -DCMAKE_USE_WIN32_THREADS_INIT=0 - -DCMAKE_USE_PTHREADS_INIT=1 - -DTHREADS_PREFER_PTHREAD_FLAG=ON - -DBUILD_ARCH=armv8-a - -DUSE_INTGEMM=off - -DUSE_SIMDE=on - -DUSE_RUY=on - -DUSE_ONNX_SGEMM=on # For time being. - -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. -) -# Additionally list variables finally configured. -cmake -L \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \ - -DCMAKE_MODULE_PATH=$CUSTOM_MODULE_PATH \ - -DANDROID_TOOLCHAIN=clang \ - -DANDROID_ABI=$ABI \ - -DANDROID_PLATFORM=$ANDROID_PLATFORM \ - -DANDROID_NATIVE_API_LEVEL=$MINSDKVERSION \ - -DANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.8 \ - -DANDROID_STL=c++_static \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - "${OTHER_ANDROID_ARGS[@]}" "${OTHER_MARIAN_ARGS[@]}" \ - .. 
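Patch 46 below deletes the Path::kStandardCpp specialization, leaving only the NEON code in ruy_adapter.h. Since the commit message defers the scalar path to "later for tests separately", a self-contained restatement of the deleted scalar quantizer is kept here for reference; it mirrors the removed code rather than adding anything new.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    using Index = unsigned int;

    // Scalar reference: round to nearest after scaling, saturate to the
    // int8 range [-127, 127], then narrow. Same logic as the deleted
    // Preprocess<Path::kStandardCpp>::quantize.
    inline void quantizeRef(const float *input,
                            std::int8_t *output,
                            float scale,
                            Index rows,
                            Index width) {
      const Index size = rows * width;
      for(Index i = 0; i < size; i++) {
        float value = std::roundf(scale * input[i]);
        value = std::max(-127.0f, value);
        value = std::min(127.0f, value);
        output[i] = static_cast<std::int8_t>(value);
      }
    }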
From 82a15e1900e0a5caf08dee37b6d69946ad847218 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 09:52:51 +0000 Subject: [PATCH 46/67] Removing kStandardCpp - may add later for tests separately --- src/tensors/cpu/intgemm_interface.h | 4 +- src/tensors/cpu/ruy_adapter.h | 342 +++++++++++----------------- 2 files changed, 133 insertions(+), 213 deletions(-) diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 0612559d9..865c97d3f 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -461,7 +461,7 @@ float scalar_; intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); #else typedef typename intgemm_::type Integer; - auto callback = marian::cpu::integer::Preprocess::UnquantizeAndWrite(unquant_mult); + auto callback = marian::cpu::integer::UnquantizeAndWrite(unquant_mult); intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ val_->data(), /*output*/ @@ -555,7 +555,7 @@ class AffineNodeOp : public NaryNodeOp { } #else typedef typename intgemm_::type Integer; - auto callback = marian::cpu::integer::Preprocess::UnquantizeAndAddBiasAndWrite(unquant_mult, + auto callback = marian::cpu::integer::UnquantizeAndAddBiasAndWrite(unquant_mult, child(2)->val()->data() /*child(2) is bias*/); intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 2d19fc4a3..90383e86a 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -28,85 +28,9 @@ using Index = unsigned int; // and required in the fast matrix-multiplication workflow for machine-translation, that exists in // marian. -enum class Path { - kStandardCpp = 0, // Pure C++ - kNeon = 1 // NEON Intrinsics (ARM) -}; - -#if RUY_PLATFORM_NEON -constexpr Path kHighestPath = Path::kNeon; -#else -constexpr Path kHighestPath = Path::kStandardCpp; -#endif - -template -struct Preprocess; - /* * Naive implementation using standard C++ functions. Not optimized using SIMD operations. */ -template <> -struct Preprocess { - static void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { - const Index size = rows * width; - for(Index i = 0; i < size; i++) { - // Round to nearest after multiplying with scale. - float value = roundf(scale * input[i]); - - // Since float can store bigger values, we threshold anything that's gone - // higher and can't fit in int8. - value = std::max(-127.0f, value); - value = std::min(127.0f, value); - - // Finally a static cast. 
- output[i] = static_cast(value); - }; - } - - template - static void transpose(const Scalar *input, Index rows, Index cols, Scalar *output) { - for(Index i = 0; i < rows; i++) { - for(Index j = 0; j < cols; j++) { - output[j * rows + i] = input[i * cols + j]; - } - } - } - - struct UnquantizeAndAddBiasAndWrite { - UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) - : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} - - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { - for(Index i = 0; i < rows_A; i++) { - for(Index j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier_) + input_bias_prepared_[j]; - } - } - } - - private: - float unquant_multiplier_; - const float *input_bias_prepared_; - }; - - struct UnquantizeAndWrite { - explicit UnquantizeAndWrite(float unquant_multiplier) - : unquant_multiplier_(unquant_multiplier) {} - - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { - for(Index i = 0; i < rows_A; i++) { - for(Index j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier_); - } - } - } - - private: - float unquant_multiplier_; - }; -}; #if RUY_PLATFORM_NEON @@ -114,82 +38,68 @@ struct Preprocess { * Optimized path using ARM NEON SIMD intrinsics. Currently only supports int8_t. * TODO: Expand support to 16-bit. */ -template <> -struct Preprocess { - static void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { - const float32x4_t *Input = reinterpret_cast(input); - const float32x4_t *InputEnd = reinterpret_cast(input + rows * width); - - int8x8_t *Output = reinterpret_cast(output); - while(Input != InputEnd) { - // Vector multiply by scalar - // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); - // VMUL.F32 q0,q0,d0[0] - float32x4_t scaledFloat_lo = vmulq_n_f32(*Input++, scale); - - // Convert from float - // int32x4_t vcvtnq_s32_f32(float32x4_t a); - // VCVT.S32.F32 q0, q0 - int32x4_t scaledInt_lo = vcvtnq_s32_f32(scaledFloat_lo); - - // Vector saturating narrow integer - // int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 - int16x4_t s16x4_lo = vqmovn_s32(scaledInt_lo); - - // Vector multiply by scalar - // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); - // VMUL.F32 q0,q0,d0[0] - float32x4_t scaledFloat_hi = vmulq_n_f32(*Input++, scale); - - // Convert from float - // int32x4_t vcvtnq_s32_f32(float32x4_t a); - // VCVT.S32.F32 q0, q0 - int32x4_t scaledInt_hi = vcvtnq_s32_f32(scaledFloat_hi); - - // Vector saturating narrow integer - // int16x4_t vqmovn_s32(int32x4_t a); - // VQMOVN.S32 d0,q0 - int16x4_t s16x4_hi = vqmovn_s32(scaledInt_hi); - - // Combine two ints. 
- // int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); - int16x8_t s16x8 = vcombine_s16(s16x4_lo, s16x4_hi); - - // Vector saturating narrow integer - int8x8_t s8x8 = vqmovn_s16(s16x8); - - *Output = s8x8; - ++Output; - }; - } - - // Specialization for int8_t - static void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) { - constexpr Index tile_size = 16; - // TODO(jerin): Enable - // assert(rows % tile_size == 0 && cols & tile_size == 0); - for(Index i = 0; i < rows; i += tile_size) { - for(Index j = 0; j < cols; j += tile_size) { - _transpose_16x16(input, i, j, rows, cols, output); - } - } - } - - static void _transpose_16x16(const int8_t *src, - Index i, - Index j, - Index rows, - Index cols, - int8_t *dst) { - // Implemented following the algorithm described in - // https://stackoverflow.com/a/29587984/4565794 - // - // permute n 32-bit rows - // permute n 64-bit rows - // ... - // permute n simd_width/2-bit rows - - // clang-format off +inline void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { + const float32x4_t *Input = reinterpret_cast(input); + const float32x4_t *InputEnd = reinterpret_cast(input + rows * width); + + int8x8_t *Output = reinterpret_cast(output); + while(Input != InputEnd) { + // Vector multiply by scalar + // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); + // VMUL.F32 q0,q0,d0[0] + float32x4_t scaledFloat_lo = vmulq_n_f32(*Input++, scale); + + // Convert from float + // int32x4_t vcvtnq_s32_f32(float32x4_t a); + // VCVT.S32.F32 q0, q0 + int32x4_t scaledInt_lo = vcvtnq_s32_f32(scaledFloat_lo); + + // Vector saturating narrow integer + // int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 + int16x4_t s16x4_lo = vqmovn_s32(scaledInt_lo); + + // Vector multiply by scalar + // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); + // VMUL.F32 q0,q0,d0[0] + float32x4_t scaledFloat_hi = vmulq_n_f32(*Input++, scale); + + // Convert from float + // int32x4_t vcvtnq_s32_f32(float32x4_t a); + // VCVT.S32.F32 q0, q0 + int32x4_t scaledInt_hi = vcvtnq_s32_f32(scaledFloat_hi); + + // Vector saturating narrow integer + // int16x4_t vqmovn_s32(int32x4_t a); + // VQMOVN.S32 d0,q0 + int16x4_t s16x4_hi = vqmovn_s32(scaledInt_hi); + + // Combine two ints. + // int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); + int16x8_t s16x8 = vcombine_s16(s16x4_lo, s16x4_hi); + + // Vector saturating narrow integer + int8x8_t s8x8 = vqmovn_s16(s16x8); + + *Output = s8x8; + ++Output; + }; +} + +inline void _transpose_16x16(const int8_t *src, + Index i, + Index j, + Index rows, + Index cols, + int8_t *dst) { + // Implemented following the algorithm described in + // https://stackoverflow.com/a/29587984/4565794 + // + // permute n 32-bit rows + // permute n 64-bit rows + // ... + // permute n simd_width/2-bit rows + + // clang-format off // Permute 8 8-bit rows. // Load int8x16x2 from memory into SIMD registers, transpose as 2x2 matrices. 
@@ -250,72 +160,82 @@ struct Preprocess { vst1q_s8(&dst[14*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[1]), vget_high_s32(x6.val[1])))); vst1q_s8(&dst[15*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[1]), vget_high_s32(x7.val[1])))); - // clang-format on + // clang-format on +} + +// Specialization for int8_t +inline void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) { + constexpr Index tile_size = 16; + // TODO(jerin): Enable + // assert(rows % tile_size == 0 && cols & tile_size == 0); + for(Index i = 0; i < rows; i += tile_size) { + for(Index j = 0; j < cols; j += tile_size) { + _transpose_16x16(input, i, j, rows, cols, output); + } } +} - struct UnquantizeAndAddBiasAndWrite { - UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) - : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} - - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { - // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); - const int32x4_t *Input = reinterpret_cast(input); - const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); - float32x4_t *Output = reinterpret_cast(output); - - while(Input != InputEnd) { - // Bias cycles every column for addition. - const float32x4_t *Bias = reinterpret_cast(input_bias_prepared_); - - // InputEnd needs to be determined to end the while loop below. - const int32x4_t *RowEnd = reinterpret_cast( - reinterpret_cast(Input) + cols_B); - - while(Input != RowEnd) { - // Operation happening for 4-elements together: - // output = [int32_t]input * [float]quant_mult + [float]bias; - float32x4_t floatInput = vcvtq_f32_s32(*Input++); - float32x4_t unquantized = vmulq_f32(floatInput, multiplier); - *Output++ = vaddq_f32(unquantized, *Bias++); - } +struct UnquantizeAndAddBiasAndWrite { + UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) + : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // Bias cycles every column for addition. + const float32x4_t *Bias = reinterpret_cast(input_bias_prepared_); + + // InputEnd needs to be determined to end the while loop below. 
+ const int32x4_t *RowEnd + = reinterpret_cast(reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = vaddq_f32(unquantized, *Bias++); } } + } - private: - float unquant_multiplier_; - const float *input_bias_prepared_; - }; +private: + float unquant_multiplier_; + const float *input_bias_prepared_; +}; - struct UnquantizeAndWrite { - explicit UnquantizeAndWrite(float unquant_multiplier) - : unquant_multiplier_(unquant_multiplier) {} - - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { - // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); - const int32x4_t *Input = reinterpret_cast(input); - const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); - float32x4_t *Output = reinterpret_cast(output); - - while(Input != InputEnd) { - // InputEnd needs to be determined to end the while loop below. - const int32x4_t *RowEnd = reinterpret_cast( - reinterpret_cast(Input) + cols_B); - - while(Input != RowEnd) { - // Operation happening for 4-elements together: - // output = [int32_t]input * [float]quant_mult + [float]bias; - float32x4_t floatInput = vcvtq_f32_s32(*Input++); - float32x4_t unquantized = vmulq_f32(floatInput, multiplier); - *Output++ = unquantized; - } +struct UnquantizeAndWrite { + explicit UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // InputEnd needs to be determined to end the while loop below. 
+ const int32x4_t *RowEnd + = reinterpret_cast(reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = unquantized; } } + } - private: - float unquant_multiplier_; - }; +private: + float unquant_multiplier_; }; #endif @@ -340,7 +260,7 @@ struct IntgemmViaRuy { float quant_mult, Index rows, Index cols) { - Preprocess::quantize(input, output, quant_mult, rows, cols); + quantize(input, output, quant_mult, rows, cols); } static void PrepareB(const float *, Type *, float, Index, Index) { @@ -352,7 +272,7 @@ struct IntgemmViaRuy { float quant_mult, Index rows, Index cols) { - Preprocess::quantize(input, output, quant_mult, rows, cols); + quantize(input, output, quant_mult, rows, cols); } static void SelectColumnsB(const Type *input, From 4a8c0da63cca0110478815ccc8f24f890ae34e3a Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 10:08:32 +0000 Subject: [PATCH 47/67] Remove leftover gcc diagnostic pop for SIMDE --- src/3rd_party/sse_mathfun.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/3rd_party/sse_mathfun.h b/src/3rd_party/sse_mathfun.h index bb0be6c65..91155cac3 100644 --- a/src/3rd_party/sse_mathfun.h +++ b/src/3rd_party/sse_mathfun.h @@ -712,6 +712,3 @@ static inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) { *c = _mm_xor_ps(xmm2, sign_bit_cos); } -#ifdef USE_SIMDE -#pragma GCC diagnostic pop -#endif From e17a5dd891550b7f67304aac2c038bbfacb8fbd5 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 10:09:49 +0000 Subject: [PATCH 48/67] Remove simde-no-tests reference in CMakeLists.txt file --- src/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5845a6710..35bd6866a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,7 +8,6 @@ include_directories(3rd_party/sentencepiece/third_party/protobuf-lite) include_directories(3rd_party/fbgemm/include) include_directories(3rd_party/intgemm) include_directories(3rd_party/ruy) -include_directories(3rd_party/simde-no-tests) include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party/intgemm) include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party) include_directories(${CMAKE_BINARY_DIR}/local/include) From 3c8a149c5720a6549c356895e52c1875658b507b Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 10:21:58 +0000 Subject: [PATCH 49/67] Remove obsolete comments --- src/tensors/cpu/ruy_adapter.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 90383e86a..4066ac895 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -22,21 +22,10 @@ namespace integer { using Index = unsigned int; -// The following partitions a pure C++ slow implementation and a faster SIMD implementation using -// NEON intrinsics on ARM hardware. Ruy already has such a routing, but we add some preprocessing -// and postprocessing functions (quantize, transpose, unquantize) that are outside ruy's offerings -// and required in the fast matrix-multiplication workflow for machine-translation, that exists in -// marian. - -/* - * Naive implementation using standard C++ functions. Not optimized using SIMD operations. - */ - #if RUY_PLATFORM_NEON /* * Optimized path using ARM NEON SIMD intrinsics. 
Currently only supports int8_t. - * TODO: Expand support to 16-bit. */ inline void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { const float32x4_t *Input = reinterpret_cast(input); From 800402c85ab3293d311ea28b8987433b12c0333e Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 13:55:56 +0000 Subject: [PATCH 50/67] Explain copying x86-SSE structure for NEON --- src/common/types.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/common/types.h b/src/common/types.h index 621551daf..6052896fc 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -173,15 +173,22 @@ struct intgemm8 { #if defined(__ARM_NEON) || defined(__ARM_NEON__) +// The following struct fills this structure for ARM with NEON SIMD, changing +// __m128 and _mm_set1_ps with the equivalents on ARM-NEON. struct float32x4 { private: + // NEON uses 128-bit SIMD registers, same as SSE. We are copying this class + // and locally aliasing __m128 to float32x4_t, which is the NEON + // equivalent. using __m128 = float32x4_t; __m128 f_; public: float32x4() {} float32x4(const __m128& f) : f_(f) {} - float32x4(const float& f) : f_(vdupq_n_f32(f)) {} // __m128 _mm_set1_ps(float) copies value into all slots + // __m128 _mm_set1_ps(float) copies value into all slots, vdupq_n_f32 is it's + // NEON equivalent. + float32x4(const float& f) : f_(vdupq_n_f32(f)) {} operator const __m128&() const { return f_; } operator __m128&() { return f_; } From 9d648d0ebfeee5b10591280f10e0863bbc5e8a37 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 20 Jun 2022 10:30:55 +0000 Subject: [PATCH 51/67] Remove executable upload for android --- .github/workflows/arm.yml | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index 9667ece05..55e2978cc 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -118,22 +118,3 @@ jobs: path: ${{github.workspace}}/build/marian-decoder - # Disable release for now. - # release: - # name: Release Latest Build - # runs-on: ubuntu-latest - # needs: [ubuntu] - # if: github.ref == 'refs/heads/master' - # steps: - # - name: Download artifacts - # uses: actions/download-artifact@v2 - # - # - name: Update GitHub prerelease - # uses: marvinpinto/action-automatic-releases@latest - # with: - # repo_token: ${{ secrets.GITHUB_TOKEN }} - # automatic_release_tag: latest - # prerelease: true - # title: "Latest Build" - # files: | - # artifact/marian-decoder From d19a3123b47d7370841ec3ac53edade132c23b4c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 20 Jun 2022 10:31:56 +0000 Subject: [PATCH 52/67] Remove comment --- src/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 35bd6866a..76aa0e2b3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -101,7 +101,6 @@ set(MARIAN_SOURCES ) if (NOT USE_WASM_COMPATIBLE_SOURCE AND NOT ANDROID) - # Hi WASM, Android hates this too. 
list(APPEND MARIAN_SOURCES 3rd_party/ExceptionWithCallStack.cpp ) From 1b38e01bb8c7738dace52ed84d972ca4f9b74897 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 09:51:32 +0000 Subject: [PATCH 53/67] Restore -Werror --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 433258442..f39e2acc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,7 @@ else(MSVC) set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${CLANG_IGNORE_UNUSED_VALUES}") # This needs to appear here as well to appease clang11+ on linux # These are used in src/CMakeLists.txt on a per-target basis - list(APPEND ALL_WARNINGS -Wall; # -Werror; + list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function; -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; From 9027ea4c47a8e0ae031dcd52529c3e9c4e79a557 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 10:06:59 +0000 Subject: [PATCH 54/67] Switch to a {{0}} sigaction on WASM, {0} for rest --- src/common/logging.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 51d80510a..3a7afd846 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -128,7 +128,11 @@ static void setErrorHandlers() { std::set_terminate(unhandledException); #ifdef __unix__ // catch segfaults +#ifdef WASM_COMPATIBLE_SOURCE + struct sigaction sa = {{ 0 }}; +#else // WASM_COMPATIBLE_SOURCE struct sigaction sa = { 0 }; +#endif // WASM_COMPATIBLE_SOURCE sigemptyset(&sa.sa_mask); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); }; From a0ee5275d5e95a4ba810ff18d3d165d6b93a501d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 20:32:06 +0000 Subject: [PATCH 55/67] Revert "Restore -Werror" This reverts commit 1b38e01bb8c7738dace52ed84d972ca4f9b74897. 
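The -Werror flip-flop (restored in patch 53, reverted here) is resolved for good in patches 60 and 61: the offending warnings come from third-party headers (ruy trips -Wcomment, simd_utils trips -Wstrict-aliasing), so those includes get wrapped in diagnostic pragmas and the flag goes back on for marian's own sources. The wrapping pattern in its full push/pop form, shown for one representative header:

    // Narrowly silence a warning that originates inside a third-party
    // header, leaving -Werror intact for everything after the pop.
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wcomment"
    #include "ruy/ruy.h"
    #pragma GCC diagnostic pop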
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f39e2acc5..433258442 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,7 @@ else(MSVC) set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${CLANG_IGNORE_UNUSED_VALUES}") # This needs to appear here as well to appease clang11+ on linux # These are used in src/CMakeLists.txt on a per-target basis - list(APPEND ALL_WARNINGS -Wall; -Werror; + list(APPEND ALL_WARNINGS -Wall; # -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function; -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; From 6285f28209b95a05a5ba883bc763b7f3d9e350a1 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 20:41:08 +0000 Subject: [PATCH 56/67] Use -DFMA for NEON from simd_utils example --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 433258442..32309e33e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,7 +80,7 @@ if(USE_INTGEMM) endif(USE_INTGEMM) if(USE_SIMD_UTILS) - add_compile_definitions(ARM SSE) #added for ARM + add_compile_definitions(ARM FMA SSE) #added for ARM if(MSVC) add_compile_options(/flax-vector-conversions) else(MSVC) From 8895fda8ed69a7da0ecc2abd83ff5c14c785416c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 20:48:15 +0000 Subject: [PATCH 57/67] Remove redundant neon_mathfun include after simd_utils.h --- src/functional/operators.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/functional/operators.h b/src/functional/operators.h index 00cb9c819..7bffec066 100755 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -215,7 +215,6 @@ struct Ops { #if defined(__ARM_NEON) || defined(__ARM_NEON__) #include "3rd_party/simd_utils/simd_utils.h" -#include "3rd_party/simd_utils/neon_mathfun.h" #else #include "3rd_party/sse_mathfun.h" #endif From c6c3ac6fc1e64862f3c808d127e07cc5cb72406c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 20:50:32 +0000 Subject: [PATCH 58/67] Wrap CmakeLists.txt ARM definitions with an if --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32309e33e..00c73500c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,7 +80,9 @@ if(USE_INTGEMM) endif(USE_INTGEMM) if(USE_SIMD_UTILS) - add_compile_definitions(ARM FMA SSE) #added for ARM + if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + add_compile_definitions(ARM FMA SSE) #added for ARM + endif() if(MSVC) add_compile_options(/flax-vector-conversions) else(MSVC) From 3baf620296502bae093f9a3b537ea889efe4c9cb Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 23 Jun 2022 22:25:46 +0000 Subject: [PATCH 59/67] Use __clang__ instead of WASM_COMPATIBLE_SOURCE; emcc uses LLVM --- src/common/logging.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 3a7afd846..69c3ccb78 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -128,11 +128,14 @@ static void setErrorHandlers() { std::set_terminate(unhandledException); #ifdef __unix__ // catch segfaults -#ifdef WASM_COMPATIBLE_SOURCE +// Emscripten uses Clang/LLVM as its underlying codegen compiler, so the +// preprocessor defines __llvm__ and __clang__ are defined. +// Exists to appease -Werror gods. 
+#ifdef __clang__ struct sigaction sa = {{ 0 }}; -#else // WASM_COMPATIBLE_SOURCE +#else // __clang__ struct sigaction sa = { 0 }; -#endif // WASM_COMPATIBLE_SOURCE +#endif // __clang__ sigemptyset(&sa.sa_mask); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); }; From aa1842cc527418790d24513b4e605b422406a5c4 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 23 Jun 2022 22:26:44 +0000 Subject: [PATCH 60/67] Suppress warnings by #pragma GCC diagnostic ... --- src/functional/operators.h | 3 +++ src/tensors/cpu/integer_common.h | 4 ++++ src/tensors/cpu/prod_blas.h | 3 +++ src/tensors/cpu/ruy_adapter.h | 3 +++ 4 files changed, 13 insertions(+) diff --git a/src/functional/operators.h b/src/functional/operators.h index 7bffec066..1a67e22a1 100755 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -214,7 +214,10 @@ struct Ops { #ifndef __CUDACC__ #if defined(__ARM_NEON) || defined(__ARM_NEON__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" #include "3rd_party/simd_utils/simd_utils.h" +#pragma GCC diagnostic pop #else #include "3rd_party/sse_mathfun.h" #endif diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h index 67ee1e94f..389f5c4e8 100644 --- a/src/tensors/cpu/integer_common.h +++ b/src/tensors/cpu/integer_common.h @@ -7,7 +7,11 @@ #ifdef USE_INTGEMM #include "3rd_party/intgemm/intgemm/intgemm.h" #else // USE_INTGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wall" #include +#pragma GCC diagnostic pop + #include "ruy_adapter.h" #endif // USE_INTGEMM #if defined(WASM) diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index bab16cfbd..61053b402 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -6,8 +6,11 @@ #elif USE_ONNX_SGEMM #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" #elif USE_RUY_SGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" #include "ruy/ruy.h" #include "ruy/system_aligned_alloc.h" +#pragma GCC pop #endif #if USE_RUY_SGEMM diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 4066ac895..ad6bc1ce6 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -9,8 +9,11 @@ #include #include #include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" #include "ruy/platform.h" #include "ruy/system_aligned_alloc.h" +#pragma GCC diagnostic pop #if RUY_PLATFORM_NEON #include From 8eae08bca9ebb42e0c58f3a630a88b189cb86ab5 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 23 Jun 2022 22:27:05 +0000 Subject: [PATCH 61/67] Re-enable -Werror --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 00c73500c..f505188d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -310,8 +310,7 @@ else(MSVC) set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${CLANG_IGNORE_UNUSED_VALUES}") # This needs to appear here as well to appease clang11+ on linux # These are used in src/CMakeLists.txt on a per-target basis - list(APPEND ALL_WARNINGS -Wall; # -Werror; - -Wextra; -Wno-unused-result; -Wno-deprecated; + list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function; -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers; ${CLANG_IGNORE_UNUSED_PRIVATE_FIELD}) From 
From 9a541c456a864a59502fdc5fee494cf2965dd671 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Thu, 23 Jun 2022 22:50:48 +0000
Subject: [PATCH 62/67] {0} -> {} to work around empty-braces Werror

---
 src/common/logging.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/common/logging.cpp b/src/common/logging.cpp
index 69c3ccb78..c21cf4207 100644
--- a/src/common/logging.cpp
+++ b/src/common/logging.cpp
@@ -128,14 +128,7 @@ static void setErrorHandlers() {
   std::set_terminate(unhandledException);
 #ifdef __unix__ // catch segfaults
-// Emscripten uses Clang/LLVM as its underlying codegen compiler, so the
-// preprocessor macros __llvm__ and __clang__ are defined.
-// Exists to appease -Werror gods.
-#ifdef __clang__
-  struct sigaction sa = {{ 0 }};
-#else // __clang__
-  struct sigaction sa = { 0 };
-#endif // __clang__
+  struct sigaction sa = {};
   sigemptyset(&sa.sa_mask);
   sa.sa_flags = SA_SIGINFO;
   sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };
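Note: the reason empty braces end the dance above is that struct sigaction begins with a nested union/struct on some libcs, so "{ 0 }" explicitly initializes only that first member and clang complains (-Wmissing-braces), while "{{ 0 }}" is not accepted everywhere either. Value-initialization zeroes the whole object and is spelled identically under GCC, Clang, and emcc. A sketch:

    #include <csignal>

    // Portable zero-initialization: every member, including nested
    // aggregates, is value-initialized; no compiler asks for extra braces.
    struct sigaction makeZeroedSigaction() {
      struct sigaction sa = {};
      sigemptyset(&sa.sa_mask);
      return sa;
    }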
From 4b8039901ba29a427f33b42d4d25c24aced47018 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Fri, 24 Jun 2022 07:46:52 +0000
Subject: [PATCH 63/67] Replace -Wall with -Wcomment

---
 src/tensors/cpu/integer_common.h | 226 ++++++++++++++++++-------------
 1 file changed, 132 insertions(+), 94 deletions(-)

diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index 389f5c4e8..ad46735a0 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -1,19 +1,19 @@
 #pragma once

+#include "common/io_item.h"
+#include "tensors/cpu/aligned.h"
 #include "tensors/tensor_allocator.h"
 #include "tensors/tensor_operators.h"
-#include "tensors/cpu/aligned.h"
-#include "common/io_item.h"

 #ifdef USE_INTGEMM
 #include "3rd_party/intgemm/intgemm/intgemm.h"
-#else // USE_INTGEMM
+#else  // USE_INTGEMM
 #pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wall"
+#pragma GCC diagnostic ignored "-Wcomment"
 #include <ruy/ruy.h>
 #pragma GCC diagnostic pop

 #include "ruy_adapter.h"
-#endif // USE_INTGEMM
+#endif  // USE_INTGEMM
 #if defined(WASM)
 #include "wasm_intgemm_interface.h"
 #endif
@@ -25,106 +25,144 @@ namespace marian {
 namespace cpu {
 namespace integer {

-//Convenient function to get rows and columns of a tensor, shadowed by namespace.
-inline int cols(Tensor& tensor) { return tensor->shape()[-1]; }
-inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }
+// Convenient function to get rows and columns of a tensor, shadowed by namespace.
+inline int cols(Tensor &tensor) {
+  return tensor->shape()[-1];
+}
+inline int rows(Tensor &tensor) {
+  return tensor->shape().elements() / cols(tensor);
+}

-inline int cols(Shape& shape) { return shape[-1]; }
-inline int rows(Shape& shape) { return shape.elements() / cols(shape); }
+inline int cols(Shape &shape) {
+  return shape[-1];
+}
+inline int rows(Shape &shape) {
+  return shape.elements() / cols(shape);
+}

 // This operates on floats after processing so doesn't care about int8_t vs int16_t.
 void AddBias(marian::Tensor C, const marian::Tensor Bias);

 #ifdef USE_INTGEMM
-template <Type type> struct intgemm_;
-template <> struct intgemm_<Type::intgemm8> {using width = intgemm::Int8;
-                                             using type = int8_t;
-                                             constexpr static const Type intgemmType = Type::intgemm8;};
-template <> struct intgemm_<Type::intgemm16> {using width = intgemm::Int16;
-                                              using type = int16_t;
-                                              constexpr static const Type intgemmType = Type::intgemm16;};
-
-
-
-#else // USE_INTGEMM
-
-template <Type type> struct intgemm_;
-template <> struct intgemm_<Type::intgemm8> {using width = IntgemmViaRuy::Int8;
-                                             using type = IntgemmViaRuy::Int8::Type;
-                                             constexpr static const Type intgemmType = Type::intgemm8;};
-template <> struct intgemm_<Type::intgemm16> {using width = IntgemmViaRuy::Int16;
-                                              using type = IntgemmViaRuy::Int16::Type;
-                                              constexpr static const Type intgemmType = Type::intgemm16;};
-
-#endif // USE_INTGEMM
+template <Type type>
+struct intgemm_;
+template <>
+struct intgemm_<Type::intgemm8> {
+  using width = intgemm::Int8;
+  using type = int8_t;
+  constexpr static const Type intgemmType = Type::intgemm8;
+};
+template <>
+struct intgemm_<Type::intgemm16> {
+  using width = intgemm::Int16;
+  using type = int16_t;
+  constexpr static const Type intgemmType = Type::intgemm16;
+};
+
+#else  // USE_INTGEMM
+
+template <Type type>
+struct intgemm_;
+template <>
+struct intgemm_<Type::intgemm8> {
+  using width = IntgemmViaRuy::Int8;
+  using type = IntgemmViaRuy::Int8::Type;
+  constexpr static const Type intgemmType = Type::intgemm8;
+};
+template <>
+struct intgemm_<Type::intgemm16> {
+  using width = IntgemmViaRuy::Int16;
+  using type = IntgemmViaRuy::Int16::Type;
+  constexpr static const Type intgemmType = Type::intgemm16;
+};
+
+#endif  // USE_INTGEMM

 // For loading architecture agnostic models. We do PrepareAndTranspose, because we already transposed
 // in our binary format. Then we copy the quantizationMultiplier information at the end
-template<Type vtype>
-void prepareAndTransposeB(io::Item& item, const char * input) {
-  #ifdef COMPILE_CPU
-    typedef typename intgemm_<vtype>::type Integer;
-    Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
-    // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
-    // If this is the case, we will need to temporarily allocate aligned memory, copy the results, and then free it
-    if (reinterpret_cast<uintptr_t>(input) % 64 == 0 && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
-      #if defined(WASM)
-        ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
-                 "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
-        int8PrepareBFromQuantizedTransposed(reinterpret_cast<const int8_t *>(input),
-                                   (Index)rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary
-                                   (Index)cols(item.shape), //rows here returns the columns of the transposed input matrix, and cols -> the rows
-                                   (int8_t *)output_tensor);
-      #else
-        intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(input),
-                                   output_tensor,
-                                   rows(item.shape),  //Since we only transposed, but didn't update the shape when constructing the binary,
-                                   cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
-      #endif
-    } else {
-      Integer * aligned_input = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
-      std::copy(input, input + rows(item.shape)*cols(item.shape), aligned_input);
-      Integer * aligned_output = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
-      #if defined(WASM)
-        ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
-                 "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
-        int8PrepareBFromQuantizedTransposed(reinterpret_cast<const int8_t *>(aligned_input),
-                                   (Index)rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary,
-                                   (Index)cols(item.shape), //rows here returns the columns of the transposed input matrix, and cols -> the rows
-                                   reinterpret_cast<int8_t *>(aligned_output));
-      #else
-        intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(aligned_input),
-                                   reinterpret_cast<Integer *>(aligned_output),
-                                   rows(item.shape),  //Since we only transposed, but didn't update the shape when constructing the binary,
-                                   cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
-      #endif
-      // Copy to output tensor
-      std::copy(aligned_output, aligned_output + rows(item.shape)*cols(item.shape), output_tensor);
-      genericFree(aligned_input);
-      genericFree(aligned_output);
-    }
-    //Copy the quantMult
-    float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
-    *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
-  #else // COMPILE_CPU
-    ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
-  #endif
+template <Type vtype>
+void prepareAndTransposeB(io::Item &item, const char *input) {
+#ifdef COMPILE_CPU
+  typedef typename intgemm_<vtype>::type Integer;
+  Integer *output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
+  // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
+  // If this is the case, we will need to temporarily allocate aligned memory, copy the results,
+  // and then free it
+  if(reinterpret_cast<uintptr_t>(input) % 64 == 0
+     && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
+#if defined(WASM)
+    ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
+             "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
+    int8PrepareBFromQuantizedTransposed(
+        reinterpret_cast<const int8_t *>(input),
+        (Index)rows(item.shape),  // Since we only transposed, but didn't update the shape when
+                                  // constructing the binary
+        (Index)cols(item.shape),  // rows here returns the columns of the transposed input matrix,
+                                  // and cols -> the rows
+        (int8_t *)output_tensor);
+#else
+    intgemm_<vtype>::width::PrepareBQuantizedTransposed(
+        reinterpret_cast<const Integer *>(input),
+        output_tensor,
+        rows(item.shape),   // Since we only transposed, but didn't update the shape when
+                            // constructing the binary,
+        cols(item.shape));  // rows here returns the columns of the transposed input matrix, and
+                            // cols -> the rows
+#endif
+  } else {
+    Integer *aligned_input = reinterpret_cast<Integer *>(
+        genericMalloc(512, rows(item.shape) * cols(item.shape) * sizeof(Integer)));
+    std::copy(input, input + rows(item.shape) * cols(item.shape), aligned_input);
+    Integer *aligned_output = reinterpret_cast<Integer *>(
+        genericMalloc(512, rows(item.shape) * cols(item.shape) * sizeof(Integer)));
+#if defined(WASM)
+    ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
+             "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
+    int8PrepareBFromQuantizedTransposed(
+        reinterpret_cast<const int8_t *>(aligned_input),
+        (Index)rows(item.shape),  // Since we only transposed, but didn't update the shape when
+                                  // constructing the binary,
+        (Index)cols(item.shape),  // rows here returns the columns of the transposed input matrix,
+                                  // and cols -> the rows
+        reinterpret_cast<int8_t *>(aligned_output));
+#else
+    intgemm_<vtype>::width::PrepareBQuantizedTransposed(
+        reinterpret_cast<const Integer *>(aligned_input),
+        reinterpret_cast<Integer *>(aligned_output),
+        rows(item.shape),   // Since we only transposed, but didn't update the shape when
+                            // constructing the binary,
+        cols(item.shape));  // rows here returns the columns of the transposed input matrix, and
+                            // cols -> the rows
+#endif
+    // Copy to output tensor
+    std::copy(aligned_output, aligned_output + rows(item.shape) * cols(item.shape), output_tensor);
+    genericFree(aligned_input);
+    genericFree(aligned_output);
+  }
+  // Copy the quantMult
+  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input)
+                                                      + item.shape.elements()));
+  *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
+#else  // COMPILE_CPU
+  ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
+#endif
 }

-template<Type vtype>
-void unquantizeWemb(io::Item& item, const char * input) {
-    typedef typename intgemm_<vtype>::type Integer;
-    float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
-    float * output_tensor = reinterpret_cast<float *>(&(*item.bytes.begin()));
-    // Explicitly calculate n once beforehand because the compiler does not pick up on its
-    // static nature, and will end up calling marian::Shape::dim() a lot.
-    const size_t n = rows(item.shape) * cols(item.shape);
-    for (size_t i = 0; i < n; i++) {
-      output_tensor[i] = reinterpret_cast<const Integer *>(input)[i]*(1/quantMult);
-    }
+template <Type vtype>
+void unquantizeWemb(io::Item &item, const char *input) {
+  typedef typename intgemm_<vtype>::type Integer;
+  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input)
+                                                      + item.shape.elements()));
+  float *output_tensor = reinterpret_cast<float *>(&(*item.bytes.begin()));
+  // Explicitly calculate n once beforehand because the compiler does not pick up on its
+  // static nature, and will end up calling marian::Shape::dim() a lot.
+  const size_t n = rows(item.shape) * cols(item.shape);
+  for(size_t i = 0; i < n; i++) {
+    output_tensor[i] = reinterpret_cast<const Integer *>(input)[i] * (1 / quantMult);
+  }
 }

-} //integer
-} //cpu
-} //marian
+}  // namespace integer
+}  // namespace cpu
+}  // namespace marian
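Note: the alignment fallback documented in prepareAndTransposeB generalizes to a simple pattern: run in place when both buffers already satisfy the kernel's alignment requirement, otherwise stage through aligned scratch memory. A standalone sketch under stated assumptions (processAligned, Kernel, and the 64-byte requirement are illustrative, not marian's exact code):

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    using Kernel = void (*)(const char *in, char *out, std::size_t bytes);

    void processAligned(const char *input, char *output, std::size_t bytes, Kernel kernel) {
      bool aligned = reinterpret_cast<std::uintptr_t>(input) % 64 == 0
                     && reinterpret_cast<std::uintptr_t>(output) % 64 == 0;
      if(aligned) {
        kernel(input, output, bytes);  // fast path: operate in place
        return;
      }
      // aligned_alloc requires the size to be a multiple of the alignment.
      std::size_t padded = (bytes + 63) / 64 * 64;
      char *in = static_cast<char *>(std::aligned_alloc(64, padded));
      char *out = static_cast<char *>(std::aligned_alloc(64, padded));
      std::memcpy(in, input, bytes);   // stage through aligned scratch
      kernel(in, out, bytes);
      std::memcpy(output, out, bytes); // copy the result back
      std::free(in);
      std::free(out);
    }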
From ac8de9196d38da03251f2799c92ee8b8ca240489 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Fri, 24 Jun 2022 07:47:40 +0000
Subject: [PATCH 64/67] Revert "Replace -Wall with -Wcomment"

This reverts commit 4b8039901ba29a427f33b42d4d25c24aced47018.
---
 src/tensors/cpu/integer_common.h | 226 +++++++++++++------------------
 1 file changed, 94 insertions(+), 132 deletions(-)

diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index ad46735a0..389f5c4e8 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -1,19 +1,19 @@
 #pragma once

-#include "common/io_item.h"
-#include "tensors/cpu/aligned.h"
 #include "tensors/tensor_allocator.h"
 #include "tensors/tensor_operators.h"
+#include "tensors/cpu/aligned.h"
+#include "common/io_item.h"

 #ifdef USE_INTGEMM
 #include "3rd_party/intgemm/intgemm/intgemm.h"
-#else  // USE_INTGEMM
+#else // USE_INTGEMM
 #pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wcomment"
+#pragma GCC diagnostic ignored "-Wall"
 #include <ruy/ruy.h>
 #pragma GCC diagnostic pop

 #include "ruy_adapter.h"
-#endif  // USE_INTGEMM
+#endif // USE_INTGEMM
 #if defined(WASM)
 #include "wasm_intgemm_interface.h"
 #endif
@@ -25,144 +25,106 @@ namespace marian {
 namespace cpu {
 namespace integer {

-// Convenient function to get rows and columns of a tensor, shadowed by namespace.
-inline int cols(Tensor &tensor) {
-  return tensor->shape()[-1];
-}
-inline int rows(Tensor &tensor) {
-  return tensor->shape().elements() / cols(tensor);
-}
+//Convenient function to get rows and columns of a tensor, shadowed by namespace.
+inline int cols(Tensor& tensor) { return tensor->shape()[-1]; }
+inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }

-inline int cols(Shape &shape) {
-  return shape[-1];
-}
-inline int rows(Shape &shape) {
-  return shape.elements() / cols(shape);
-}
+inline int cols(Shape& shape) { return shape[-1]; }
+inline int rows(Shape& shape) { return shape.elements() / cols(shape); }

 // This operates on floats after processing so doesn't care about int8_t vs int16_t.
 void AddBias(marian::Tensor C, const marian::Tensor Bias);

 #ifdef USE_INTGEMM
-template <Type type>
-struct intgemm_;
-template <>
-struct intgemm_<Type::intgemm8> {
-  using width = intgemm::Int8;
-  using type = int8_t;
-  constexpr static const Type intgemmType = Type::intgemm8;
-};
-template <>
-struct intgemm_<Type::intgemm16> {
-  using width = intgemm::Int16;
-  using type = int16_t;
-  constexpr static const Type intgemmType = Type::intgemm16;
-};
-
-#else  // USE_INTGEMM
-
-template <Type type>
-struct intgemm_;
-template <>
-struct intgemm_<Type::intgemm8> {
-  using width = IntgemmViaRuy::Int8;
-  using type = IntgemmViaRuy::Int8::Type;
-  constexpr static const Type intgemmType = Type::intgemm8;
-};
-template <>
-struct intgemm_<Type::intgemm16> {
-  using width = IntgemmViaRuy::Int16;
-  using type = IntgemmViaRuy::Int16::Type;
-  constexpr static const Type intgemmType = Type::intgemm16;
-};
-
-#endif  // USE_INTGEMM
+template <Type type> struct intgemm_;
+template <> struct intgemm_<Type::intgemm8> {using width = intgemm::Int8;
+                                             using type = int8_t;
+                                             constexpr static const Type intgemmType = Type::intgemm8;};
+template <> struct intgemm_<Type::intgemm16> {using width = intgemm::Int16;
+                                              using type = int16_t;
+                                              constexpr static const Type intgemmType = Type::intgemm16;};
+
+
+
+#else // USE_INTGEMM
+
+template <Type type> struct intgemm_;
+template <> struct intgemm_<Type::intgemm8> {using width = IntgemmViaRuy::Int8;
+                                             using type = IntgemmViaRuy::Int8::Type;
+                                             constexpr static const Type intgemmType = Type::intgemm8;};
+template <> struct intgemm_<Type::intgemm16> {using width = IntgemmViaRuy::Int16;
+                                              using type = IntgemmViaRuy::Int16::Type;
+                                              constexpr static const Type intgemmType = Type::intgemm16;};
+
+#endif // USE_INTGEMM

 // For loading architecture agnostic models. We do PrepareAndTranspose, because we already transposed
 // in our binary format. Then we copy the quantizationMultiplier information at the end
-template <Type vtype>
-void prepareAndTransposeB(io::Item &item, const char *input) {
-#ifdef COMPILE_CPU
-  typedef typename intgemm_<vtype>::type Integer;
-  Integer *output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
-  // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
-  // If this is the case, we will need to temporarily allocate aligned memory, copy the results,
-  // and then free it
-  if(reinterpret_cast<uintptr_t>(input) % 64 == 0
-     && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
-#if defined(WASM)
-    ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
-             "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
-    int8PrepareBFromQuantizedTransposed(
-        reinterpret_cast<const int8_t *>(input),
-        (Index)rows(item.shape),  // Since we only transposed, but didn't update the shape when
-                                  // constructing the binary
-        (Index)cols(item.shape),  // rows here returns the columns of the transposed input matrix,
-                                  // and cols -> the rows
-        (int8_t *)output_tensor);
-#else
-    intgemm_<vtype>::width::PrepareBQuantizedTransposed(
-        reinterpret_cast<const Integer *>(input),
-        output_tensor,
-        rows(item.shape),   // Since we only transposed, but didn't update the shape when
-                            // constructing the binary,
-        cols(item.shape));  // rows here returns the columns of the transposed input matrix, and
-                            // cols -> the rows
-#endif
-  } else {
-    Integer *aligned_input = reinterpret_cast<Integer *>(
-        genericMalloc(512, rows(item.shape) * cols(item.shape) * sizeof(Integer)));
-    std::copy(input, input + rows(item.shape) * cols(item.shape), aligned_input);
-    Integer *aligned_output = reinterpret_cast<Integer *>(
-        genericMalloc(512, rows(item.shape) * cols(item.shape) * sizeof(Integer)));
-#if defined(WASM)
-    ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
-             "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
-    int8PrepareBFromQuantizedTransposed(
-        reinterpret_cast<const int8_t *>(aligned_input),
-        (Index)rows(item.shape),  // Since we only transposed, but didn't update the shape when
-                                  // constructing the binary,
-        (Index)cols(item.shape),  // rows here returns the columns of the transposed input matrix,
-                                  // and cols -> the rows
-        reinterpret_cast<int8_t *>(aligned_output));
-#else
-    intgemm_<vtype>::width::PrepareBQuantizedTransposed(
-        reinterpret_cast<const Integer *>(aligned_input),
-        reinterpret_cast<Integer *>(aligned_output),
-        rows(item.shape),   // Since we only transposed, but didn't update the shape when
-                            // constructing the binary,
-        cols(item.shape));  // rows here returns the columns of the transposed input matrix, and
-                            // cols -> the rows
-#endif
-    // Copy to output tensor
-    std::copy(aligned_output, aligned_output + rows(item.shape) * cols(item.shape), output_tensor);
-    genericFree(aligned_input);
-    genericFree(aligned_output);
-  }
-  // Copy the quantMult
-  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input)
-                                                      + item.shape.elements()));
-  *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
-#else  // COMPILE_CPU
-  ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
-#endif
+template<Type vtype>
+void prepareAndTransposeB(io::Item& item, const char * input) {
+  #ifdef COMPILE_CPU
+    typedef typename intgemm_<vtype>::type Integer;
+    Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
+    // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
+    // If this is the case, we will need to temporarily allocate aligned memory, copy the results, and then free it
+    if (reinterpret_cast<uintptr_t>(input) % 64 == 0 && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
+      #if defined(WASM)
+        ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
+                 "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
+        int8PrepareBFromQuantizedTransposed(reinterpret_cast<const int8_t *>(input),
+                                   (Index)rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary
+                                   (Index)cols(item.shape), //rows here returns the columns of the transposed input matrix, and cols -> the rows
+                                   (int8_t *)output_tensor);
+      #else
+        intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(input),
+                                   output_tensor,
+                                   rows(item.shape),  //Since we only transposed, but didn't update the shape when constructing the binary,
+                                   cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
+      #endif
+    } else {
+      Integer * aligned_input = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
+      std::copy(input, input + rows(item.shape)*cols(item.shape), aligned_input);
+      Integer * aligned_output = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
+      #if defined(WASM)
+        ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
+                 "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
+        int8PrepareBFromQuantizedTransposed(reinterpret_cast<const int8_t *>(aligned_input),
+                                   (Index)rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary,
+                                   (Index)cols(item.shape), //rows here returns the columns of the transposed input matrix, and cols -> the rows
+                                   reinterpret_cast<int8_t *>(aligned_output));
+      #else
+        intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(aligned_input),
+                                   reinterpret_cast<Integer *>(aligned_output),
+                                   rows(item.shape),  //Since we only transposed, but didn't update the shape when constructing the binary,
+                                   cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
+      #endif
+      // Copy to output tensor
+      std::copy(aligned_output, aligned_output + rows(item.shape)*cols(item.shape), output_tensor);
+      genericFree(aligned_input);
+      genericFree(aligned_output);
+    }
+    //Copy the quantMult
+    float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
+    *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
+  #else // COMPILE_CPU
+    ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
+  #endif
 }

-template <Type vtype>
-void unquantizeWemb(io::Item &item, const char *input) {
-  typedef typename intgemm_<vtype>::type Integer;
-  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input)
-                                                      + item.shape.elements()));
-  float *output_tensor = reinterpret_cast<float *>(&(*item.bytes.begin()));
-  // Explicitly calculate n once beforehand because the compiler does not pick up on its
-  // static nature, and will end up calling marian::Shape::dim() a lot.
-  const size_t n = rows(item.shape) * cols(item.shape);
-  for(size_t i = 0; i < n; i++) {
-    output_tensor[i] = reinterpret_cast<const Integer *>(input)[i] * (1 / quantMult);
-  }
+template<Type vtype>
+void unquantizeWemb(io::Item& item, const char * input) {
+    typedef typename intgemm_<vtype>::type Integer;
+    float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
+    float * output_tensor = reinterpret_cast<float *>(&(*item.bytes.begin()));
+    // Explicitly calculate n once beforehand because the compiler does not pick up on its
+    // static nature, and will end up calling marian::Shape::dim() a lot.
+    const size_t n = rows(item.shape) * cols(item.shape);
+    for (size_t i = 0; i < n; i++) {
+      output_tensor[i] = reinterpret_cast<const Integer *>(input)[i]*(1/quantMult);
+    }
 }

-}  // namespace integer
-}  // namespace cpu
-}  // namespace marian
+} //integer
+} //cpu
+} //marian
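Note: the intgemm_ template being reverted and re-applied above is a standard compile-time dispatch: a class template left without a primary definition, plus one explicit specialization per supported quantization width, so the backend types resolve entirely at compile time. A reduced sketch of the pattern (Width and backend_ are illustrative names, not marian's):

    #include <cstdint>

    enum class Width { i8, i16 };

    template <Width w> struct backend_;  // primary template intentionally undefined
    template <> struct backend_<Width::i8>  { using type = std::int8_t;  };
    template <> struct backend_<Width::i16> { using type = std::int16_t; };

    template <Width w>
    using backend_t = typename backend_<w>::type;

    static_assert(sizeof(backend_t<Width::i16>) == 2, "resolved at compile time");

Using an undefined primary template turns an unsupported width into a compile error rather than a silent fallback.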
From 38b608af5ae1af852e68fe9c2e835a22ef11be59 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Fri, 24 Jun 2022 07:48:38 +0000
Subject: [PATCH 65/67] Disable formatting, then local edit -Wall ->
 -Wcomment

---
 src/tensors/cpu/integer_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index 389f5c4e8..d05440ef1 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -8,7 +8,7 @@
 #include "3rd_party/intgemm/intgemm/intgemm.h"
 #else // USE_INTGEMM
 #pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wall"
+#pragma GCC diagnostic ignored "-Wcomment"
 #include <ruy/ruy.h>
 #pragma GCC diagnostic pop

From 86c8d44af6edf8d501c6845dc31819e483122237 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Mon, 27 Jun 2022 11:30:53 +0000
Subject: [PATCH 66/67] Do not check for BLAS on usual ARM, except Mac: Apple
 Accelerate

---
 CMakeLists.txt | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f505188d1..b0e707012 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,7 +69,12 @@ CMAKE_DEPENDENT_OPTION(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" O

 if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
   set(USE_RUY ON)
-  set(USE_RUY_SGEMM ON)
+
+  # Apple M1 has Apple Accelerate(?).
+  if(NOT APPLE)
+    set(USE_RUY_SGEMM ON)
+  endif(NOT APPLE)
+
   set(USE_SIMD_UTILS ON)
 else()
   set(USE_INTGEMM ON)
@@ -593,9 +598,8 @@ if(COMPILE_CPU)
     set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant
   endif(USE_INTGEMM)

-  if(USE_RUY)
+  if(USE_RUY OR USE_RUY_SGEMM)
     set(EXT_LIBS ${EXT_LIBS} ruy)
-    add_compile_definitions(USE_RUY_SGEMM=1)
   endif(USE_RUY)

   add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU
@@ -613,6 +617,8 @@ if(COMPILE_CPU)
     include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers")
     set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate")
     add_definitions(-DBLAS_FOUND=1)
+  elseif(USE_RUY_SGEMM)
+    add_compile_definitions(USE_RUY_SGEMM=1)
   else(USE_ONNX_SGEMM)
     if(USE_MKL)
       find_package(MKL)

From 861e31d55931bf2578ae2765dd0859e33ee2700c Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Fri, 1 Jul 2022 10:58:47 +0000
Subject: [PATCH 67/67] Fix endif: CMake script quirks

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b0e707012..efa096f55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -600,7 +600,7 @@ if(COMPILE_CPU)

   if(USE_RUY OR USE_RUY_SGEMM)
     set(EXT_LIBS ${EXT_LIBS} ruy)
-  endif(USE_RUY)
+  endif(USE_RUY OR USE_RUY_SGEMM)

   add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU
   # in case a BLAS vendor is not found, we have a runtime error, although we should probably not allow the compilation to go on
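Note: the float GEMM path that USE_RUY_SGEMM switches in ultimately reduces to a plain ruy multiply. A hedged sketch using ruy's public API (row-major layouts and the ruySgemm wrapper are chosen for illustration; marian's prod_blas.h wrapper differs in detail). It computes C = A * B with A m-by-k and B k-by-n:

    #include "ruy/ruy.h"

    void ruySgemm(const float *A, const float *B, float *C, int m, int k, int n) {
      ruy::Context context;
      ruy::Matrix<float> lhs, rhs, dst;
      ruy::MakeSimpleLayout(m, k, ruy::Order::kRowMajor, lhs.mutable_layout());
      ruy::MakeSimpleLayout(k, n, ruy::Order::kRowMajor, rhs.mutable_layout());
      ruy::MakeSimpleLayout(m, n, ruy::Order::kRowMajor, dst.mutable_layout());
      lhs.set_data(A);
      rhs.set_data(B);
      dst.set_data(C);
      ruy::MulParams<float, float> mulParams;  // plain float multiply, no quantization
      ruy::Mul(lhs, rhs, mulParams, &context, &dst);
    }

On non-Apple ARM this stands in for a BLAS sgemm, which is why the CMake logic above only probes for BLAS (Accelerate) on macOS.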