From 79d7b332e56427964ded0126fd97e7619348c6a6 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Tue, 8 Feb 2022 23:08:58 +0000
Subject: [PATCH 01/67] ARM Backend for marian

Provides an ARM backend for marian: matrix multiplies are done using
google/ruy and math functions come through SIMDe
(https://simd-everywhere.github.io/blog/about/), effectively getting
marian-decoder to run on ARM.

The following cmake flags are added:

- USE_INTGEMM (to switch intgemm on/off)
- USE_RUY (to switch ruy on/off)
- USE_ONNX_SGEMM (use the ONNX SGEMM added for WASM to provide the
  attention matrix multiply, which currently relies on a BLAS library).
- USE_SIMDE (swaps out the existing Intel-intrinsic based functions for
  SIMDe equivalents).

The built marian-decoder is tested on an Oracle Cloud ARM machine with the
following specs:

Architecture   : aarch64
CPU op-mode(s) : 32-bit, 64-bit
Byte Order     : Little Endian
Vendor ID      : ARM
Model name     : Neoverse-N1
Flags          : fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp
                 asimdhp cpuid asimdrdm lrcpc dcpop asimddp ssbs

A CI check on GitHub Actions is added, using the Android NDK to
cross-compile targeting arm64-v8a. The built binary is tested to work on an
Android phone (Samsung M30s) using Termux.

A successful Android build additionally requires a patch
(sentencepiece -> protobuf). See
https://github.com/opencv/opencv/issues/17282 and
https://github.com/opencv/opencv/pull/19049.

-Werror etc. cause issues with ruy (-Wmulti-line-comment) and are disabled.

The following minor changes are also applied:

- Remove M32_BINARIES; use COMPILE_WASM for -m32.
- Hide -msse4.1 on unknown platforms.
- faiss was previously hardcoded for platforms with SSE available. This has
  been mitigated by adding a reference standard C++ implementation of the
  missing function.
- Exclude packed_gemm_....cpp from sources if USE_FBGEMM=off.
- MSVC workaround following
  https://github.com/browsermt/marian-dev/pull/56#issuecomment-945821693
---
 .github/workflows/arm.yml                     | 147 +++++
 .github/workflows/macos.yml                   |   2 +-
 .../native-customized_marian-macos.yml        |   2 +-
 .github/workflows/ubuntu.yml                  |   2 +-
 .../wasm-customized_marian-macos.yml          |   2 +-
 .../wasm-customized_marian-ubuntu.yml         |   2 +-
 .github/workflows/windows.yml                 |   2 +-
 .gitmodules                                   |   6 +
 CMakeLists.txt                                |  68 ++-
 patches/01-spm-protobuf-android.patch         |  14 +
 scripts/run.sh                                |  40 ++
 src/3rd_party/CMakeLists.txt                  |  28 +-
 src/3rd_party/faiss/VectorTransform.cpp       |  13 +-
 src/3rd_party/ruy                             |   1 +
 src/3rd_party/sentencepiece                   |   2 +-
 src/3rd_party/simde-no-tests                  |   1 +
 src/3rd_party/sse_mathfun.h                   |   9 +
 src/CMakeLists.txt                            |  15 +-
 src/common/config_parser.cpp                  |  10 -
 src/common/logging.cpp                        |   6 +-
 src/common/types.h                            |   6 +
 src/layers/lsh.cpp                            |   4 +-
 src/tensors/cpu/expression_graph_packable.h   |   8 +-
 src/tensors/cpu/integer_common.cpp            |  15 +-
 src/tensors/cpu/integer_common.h              |  34 +-
 src/tensors/cpu/intgemm_interface.h           |  62 +-
 src/tensors/cpu/prod.cpp                      |  13 +-
 src/tensors/cpu/prod_blas.h                   |  15 +-
 src/tensors/cpu/ruy_adapter.h                 | 534 ++++++++++++++++++
 29 files changed, 974 insertions(+), 89 deletions(-)
 create mode 100644 .github/workflows/arm.yml
 create mode 100644 patches/01-spm-protobuf-android.patch
 create mode 100644 scripts/run.sh
 create mode 160000 src/3rd_party/ruy
 create mode 160000 src/3rd_party/simde-no-tests
 create mode 100644 src/tensors/cpu/ruy_adapter.h

diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml
new file mode 100644
index 000000000..4cab55c9e
--- /dev/null
+++ b/.github/workflows/arm.yml
@@ -0,0 +1,147 @@
+name: ARM
+'on':
+  push:
+    branches:
+      - main
+      - ci-sandbox
+  pull_request:
+    branches:
+      - '**'
+env:
+  ccache_basedir: ${{ github.workspace }}
+  ccache_dir: "${{ github.workspace }}/.ccache"
+  ccache_compilercheck: content
+  ccache_compress: 'true'
+  ccache_compresslevel: 9
+  ccache_maxsize: 200M
+  ccache_cmake: -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache
+  ndk: "${{ github.workspace }}/android-ndk-r23b"
+  abi: "arm64-v8a"
+  minsdk_version: 28
+  android_platform: 28
+
+jobs:
+  ubuntu:
+    name: "arm-v8a cross-compile via Android NDK"
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        with:
+          submodules: recursive
+
+      - name: Install prerequisites
+        run: |
+          wget -c --quiet https://dl.google.com/android/repository/android-ndk-r23b-linux.zip
+          unzip -qq android-ndk-r23b-linux.zip
+          sudo apt-get -y install ccache cmake
+
+      - name: Generate ccache_vars for ccache based on machine
+        shell: bash
+        id: ccache_vars
+        run: |-
+          echo "::set-output name=hash::$(echo ${{ env.ccache_compilercheck }})"
+          echo "::set-output name=timestamp::$(date '+%Y-%m-%dT%H.%M.%S')"
+
+      - name: Cache-op for build-cache through ccache
+        uses: actions/cache@v2
+        with:
+          path: ${{ env.ccache_dir }}
+          key: ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}-${{ steps.ccache_vars.outputs.timestamp }}
+          restore-keys: |-
+            ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}-${{ github.ref }}
+            ccache-${{ matrix.identifier }}-${{ steps.ccache_vars.outputs.hash }}
+            ccache-${{ matrix.identifier }}
+
+      - name: ccache environment setup
+        run: |-
+          echo "CCACHE_COMPILER_CHECK=${{ env.ccache_compilercheck }}" >> $GITHUB_ENV
+          echo "CCACHE_BASEDIR=${{ env.ccache_basedir }}" >> $GITHUB_ENV
+          echo "CCACHE_COMPRESS=${{ env.ccache_compress }}" >> $GITHUB_ENV
+          echo "CCACHE_COMPRESSLEVEL=${{ env.ccache_compresslevel }}" >> $GITHUB_ENV
+          echo "CCACHE_DIR=${{ env.ccache_dir }}" >> $GITHUB_ENV
+          echo "CCACHE_MAXSIZE=${{ env.ccache_maxsize }}" >> $GITHUB_ENV
+
+      - name: ccache prolog
+        run: |-
+          ccache -s # Print current cache stats
+          ccache -z # Zero cache entry
+
+      - name: Apply patch sentencepiece for android
+        run: |-
+          patch -p1 < patches/01-spm-protobuf-android.patch
+
+      - name: Generate buildfiles for marian on android via cmake
+        run: |-
+          mkdir -p build
+          cd build
+          NDK=${{ env.ndk }}
+          ABI=${{ env.abi }}
+          MINSDK_VERSION=${{ env.minsdk_version }}
+          ANDROID_PLATFORM=${{ env.android_platform }}
+          OTHER_ANDROID_ARGS=(
+            -DANDROID_ARM_NEON=TRUE
+          )
+          OTHER_MARIAN_ARGS=(
+            -DCOMPILE_CUDA=off
+            -DCOMPILE_CPU=on
+            -DCMAKE_HAVE_THREADS_LIBRARY=1
+            -DCMAKE_USE_WIN32_THREADS_INIT=0
+            -DCMAKE_USE_PTHREADS_INIT=1
+            -DTHREADS_PREFER_PTHREAD_FLAG=ON
+            -DBUILD_ARCH=armv8-a
+            -DUSE_INTGEMM=off
+            -DUSE_SIMDE=on
+            -DUSE_RUY=on
+            -DUSE_ONNX_SGEMM=on # For the time being.
+            # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see.
+          )
+          # -L additionally lists the finally-configured variables.
+          cmake -L \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \
+            -DANDROID_TOOLCHAIN=clang \
+            -DANDROID_ABI=$ABI \
+            -DANDROID_PLATFORM=$ANDROID_PLATFORM \
+            -DANDROID_NATIVE_API_LEVEL=$MINSDK_VERSION \
+            -DANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.8 \
+            -DANDROID_STL=c++_static \
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+            "${OTHER_ANDROID_ARGS[@]}" "${OTHER_MARIAN_ARGS[@]}" \
+            ..
+
+
+      - name: Build marian for android
+        working-directory: build
+        run: |-
+          # Only build marian (lib) for now.
+ make -j2 + + - name: ccache epilog + run: 'ccache -s # Print current cache stats' + + - uses: actions/upload-artifact@v2 + with: + path: ${{github.workspace}}/build/marian-decoder + + + # Disable release for now. + # release: + # name: Release Latest Build + # runs-on: ubuntu-latest + # needs: [ubuntu] + # if: github.ref == 'refs/heads/master' + # steps: + # - name: Download artifacts + # uses: actions/download-artifact@v2 + # + # - name: Update GitHub prerelease + # uses: marvinpinto/action-automatic-releases@latest + # with: + # repo_token: ${{ secrets.GITHUB_TOKEN }} + # automatic_release_tag: latest + # prerelease: true + # title: "Latest Build" + # files: | + # artifact/marian-decoder diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 6d18dea84..b71ddad54 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-macos: diff --git a/.github/workflows/native-customized_marian-macos.yml b/.github/workflows/native-customized_marian-macos.yml index 41f102a0d..6c3405a01 100644 --- a/.github/workflows/native-customized_marian-macos.yml +++ b/.github/workflows/native-customized_marian-macos.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-macos: diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index a01770e17..f4a1c32c8 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-ubuntu: diff --git a/.github/workflows/wasm-customized_marian-macos.yml b/.github/workflows/wasm-customized_marian-macos.yml index cfafe0182..5b797499b 100644 --- a/.github/workflows/wasm-customized_marian-macos.yml +++ b/.github/workflows/wasm-customized_marian-macos.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-wasm: diff --git a/.github/workflows/wasm-customized_marian-ubuntu.yml b/.github/workflows/wasm-customized_marian-ubuntu.yml index 8294665f6..4777d04de 100644 --- a/.github/workflows/wasm-customized_marian-ubuntu.yml +++ b/.github/workflows/wasm-customized_marian-ubuntu.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] jobs: build-wasm: diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 648887aec..06aa22623 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -4,7 +4,7 @@ on: push: branches: [ master ] pull_request: - branches: [ master ] + branches: [ "**" ] env: MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip" diff --git a/.gitmodules b/.gitmodules index a8facd1fd..ae76037f1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -23,3 +23,9 @@ [submodule "src/3rd_party/onnxjs"] path = src/3rd_party/onnxjs url = https://github.com/abhi-agg/onnxjs.git +[submodule "src/3rd_party/simde-no-tests"] + path = src/3rd_party/simde-no-tests + url = https://github.com/simd-everywhere/simde-no-tests/ +[submodule "src/3rd_party/ruy"] + path = src/3rd_party/ruy + url = https://github.com/google/ruy diff --git a/CMakeLists.txt b/CMakeLists.txt index 12b38f0f5..97ef84269 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,29 +31,31 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) 
 option(USE_CUDNN "Use CUDNN library" OFF)
 option(USE_DOXYGEN "Build documentation with Doxygen" ON)
 option(USE_FBGEMM "Use FBGEMM" OFF)
+option(USE_INTGEMM "Use INTGEMM" ON)
 option(USE_MKL "Compile with MKL support" ON)
 option(USE_MPI "Use MPI library" OFF)
 option(USE_NCCL "Use NCCL library" ON)
 option(USE_SENTENCEPIECE "Download and compile SentencePiece" ON)
 option(USE_STATIC_LIBS "Link statically against non-system libs" OFF)
 option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (requires CMake 3.12+)" OFF)
-option(M32_BINARIES "Generate 32bit binaries even when building outside of WASM. Useful for testing some WASM specific functionality without the need for the compiling to WASM." OFF)
 option(COMPILE_WASM "Compile (wasm compatible) marian for WASM target" OFF)
 option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compile to wasm. Useful for debugging wasm failures by building same sources natively" OFF)
+option(USE_SIMDE "Use SIMDe to map the Intel-intrinsic based code onto the target instruction set" OFF)
+option(USE_ONNX_SGEMM "Use the wasm-compatible ONNX SGEMM for matrix multiply" OFF)
+option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF)
+
 # cmake options that are dependent on USE_WASM_COMPATIBLE_SOURCE cmake option
 CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF
                        "USE_WASM_COMPATIBLE_SOURCE" ON)
-CMAKE_DEPENDENT_OPTION(USE_WASM_COMPATIBLE_BLAS "Compile with wasm compatible blas" ON
-                       "USE_WASM_COMPATIBLE_SOURCE" OFF)
-CMAKE_DEPENDENT_OPTION(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" ON
-                       "USE_WASM_COMPATIBLE_SOURCE" OFF)

 if (USE_WASM_COMPATIBLE_SOURCE)
   set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)")
   add_compile_definitions(WASM_COMPATIBLE_SOURCE)
   # Setting USE_SSE2 definition to enable SSE2 specific code in "3rd_party/sse_mathfun.h" for wasm builds
   add_compile_definitions(USE_SSE2)
+  set(USE_ONNX_SGEMM ON CACHE BOOL "")
+  set(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "")
 endif()

 if (COMPILE_WASM)
@@ -61,10 +63,11 @@ if (COMPILE_WASM)
   set(WORMHOLE ON CACHE BOOL "Use WASM wormhole in intgemm https://bugzilla.mozilla.org/show_bug.cgi?id=1672160")
 endif()

-if(M32_BINARIES OR COMPILE_WASM)
+
+if(COMPILE_WASM)
   set("BUILD_WIDTH" "-m32")
-else(M32_BINARIES OR COMPILE_WASM)
-  set("BUILD_WIDTH" "-m64")
+else(COMPILE_WASM)
+  set("BUILD_WIDTH" "")
 endif()

 if(NOT COMPILE_WASM)
@@ -167,6 +170,7 @@ if(MSVC)
   # set(INTRINSICS "/arch:AVX")
   add_definitions(-DUSE_SSE2=1)
+  # Use the architecture MSVC is building for instead:
   set(INTRINSICS ${MSVC_BUILD_ARCH})
   # set(INTRINSICS "/arch:AVX512")
@@ -193,6 +197,16 @@ if(MSVC)
     set(EXT_LIBS ${EXT_LIBS} fbgemm)
     add_definitions(-DUSE_FBGEMM=1 -DFBGEMM_STATIC=1)
   endif(USE_FBGEMM)
+
+  if(USE_INTGEMM)
+    add_definitions(-DUSE_INTGEMM=1)
+  endif(USE_INTGEMM)
+
+  if(USE_SIMDE)
+    add_definitions(-DUSE_SIMDE=1)
+    add_definitions(-DSIMDE_ENABLE_NATIVE_ALIASES=1)
+  endif(USE_SIMDE)
+
 else(MSVC)

   # Check we are using at least g++ 5.0
@@ -249,7 +263,7 @@ else(MSVC)
     # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case)
     set(INTRINSICS "-mssse3 -msimd128")
   else()
-    set(INTRINSICS "-msse4.1")
+    # Not assuming we have "-msse4.1" here.
   endif()

   if(USE_FBGEMM)
     set(EXT_LIBS ${EXT_LIBS} fbgemm)
     add_definitions(-DUSE_FBGEMM=1)
   endif(USE_FBGEMM)

+  if(USE_INTGEMM)
+    add_definitions(-DUSE_INTGEMM=1)
+  endif(USE_INTGEMM)
+
+  if(USE_SIMDE)
+    add_definitions(-DUSE_SIMDE=1)
+    add_definitions(-DSIMDE_ENABLE_NATIVE_ALIASES=1)
+  endif(USE_SIMDE)
+
   if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0)
     # Clang-10.0.0 complains when CUDA is newer than 10.1
     set(CLANG_IGNORE_UNKNOWN_CUDA "-Wno-unknown-warning-option -Wno-unknown-cuda-version")
@@ -266,7 +289,8 @@ else(MSVC)
   set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${CLANG_IGNORE_UNUSED_VALUES}") # This needs to appear here as well to appease clang11+ on linux

   # These are used in src/CMakeLists.txt on a per-target basis
-  list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated;
+  list(APPEND ALL_WARNINGS -Wall; # -Werror;
+    -Wextra; -Wno-unused-result; -Wno-deprecated;
     -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function;
     -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare;
     -Wno-missing-field-initializers; ${CLANG_IGNORE_UNUSED_PRIVATE_FIELD})
@@ -542,24 +566,32 @@ endif(USE_MPI)
 ###############################################################################
 # Find BLAS library for CPU compilation
 if(COMPILE_CPU)
-  set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant
+  if(USE_INTGEMM)
+    set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant
+  endif(USE_INTGEMM)
+
+  if(USE_RUY)
+    set(EXT_LIBS ${EXT_LIBS} ruy)
+  endif(USE_RUY)
+
   add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU
   # in case a BLAS vendor is not found, we have a runtime error, although we should probably not allow the compilation to go on
   # if there are BLAS vendors, we have other runtime checks with sane error messages.
-  if(USE_WASM_COMPATIBLE_BLAS)
+  if(USE_ONNX_SGEMM) ## Use a wasm compatible BLAS
+    ## ^ Strictly speaking, SGEMM != BLAS: only the matrix multiply is provided.
     set(EXT_LIBS ${EXT_LIBS} onnx-sgemm)
-    set(BLAS_FOUND TRUE)
-    set(BLAS_VENDOR "ONNX-SGEMM")
-    add_definitions(-DBLAS_FOUND=1 -DWASM_COMPATIBLE_BLAS=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock
-  elseif(APPLE AND USE_APPLE_ACCELERATE)
+    add_definitions(-DUSE_ONNX_SGEMM=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock
+  endif(USE_ONNX_SGEMM)
+
+  if(APPLE AND USE_APPLE_ACCELERATE)
     set(BLAS_VENDOR "Accelerate")
     # see https://developer.apple.com/documentation/accelerate for more info
    # you may need to install Xcode command line tools if you don't have them already (https://developer.apple.com/xcode/features/)
     include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers")
     set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate")
     add_definitions(-DBLAS_FOUND=1)
-  else(USE_WASM_COMPATIBLE_BLAS)
+  else(USE_ONNX_SGEMM)
     if(USE_MKL)
       find_package(MKL)
     endif(USE_MKL)
@@ -580,7 +612,7 @@ if(COMPILE_CPU)
       endif(CBLAS_FOUND)
     endif(BLAS_FOUND)
   endif(MKL_FOUND)
-  endif(USE_WASM_COMPATIBLE_BLAS)
+  endif(USE_ONNX_SGEMM)
 endif(COMPILE_CPU)

 ###############################################################################
diff --git a/patches/01-spm-protobuf-android.patch b/patches/01-spm-protobuf-android.patch
new file mode 100644
index 000000000..6eadb5454
--- /dev/null
+++ b/patches/01-spm-protobuf-android.patch
@@ -0,0 +1,14 @@
+diff --git a/src/3rd_party/sentencepiece/src/CMakeLists.txt b/src/3rd_party/sentencepiece/src/CMakeLists.txt
+index 0ba6407..276caec 100644
+--- a/ src/3rd_party/sentencepiece/src/CMakeLists.txt
++++ b/src/3rd_party/sentencepiece/src/CMakeLists.txt
+@@ -159,6 +159,9 @@ set(SPM_TEST_SRCS
+ find_package(Threads REQUIRED)
+ 
+ set(SPM_LIBS ${PROTOBUF_LITE_LIBRARY} Threads::Threads)
++if(ANDROID)
++  set(SPM_LIBS ${SPM_LIBS} android log)
++endif(ANDROID)
+ 
+ if (SPM_ENABLE_NFKC_COMPILE)
+   find_package(ICU 4.4 COMPONENTS i18n data uc REQUIRED)
diff --git a/scripts/run.sh b/scripts/run.sh
new file mode 100644
index 000000000..f080dee68
--- /dev/null
+++ b/scripts/run.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+NDK=/mnt/Storage/jphilip/android-ndk-r23b
+ABI=arm64-v8a
+MINSDK_VERSION=28
+CUSTOM_MODULE_PATH=/mnt/Storage/jphilip/marian-android/openblas-install/lib/cmake/openblas
+ANDROID_PLATFORM=28
+
+OTHER_ANDROID_ARGS=(
+  -DANDROID_ARM_NEON=TRUE
+)
+
+OTHER_MARIAN_ARGS=(
+  -DCOMPILE_CUDA=off
+  -DCOMPILE_CPU=on
+  -DCMAKE_HAVE_THREADS_LIBRARY=1
+  -DCMAKE_USE_WIN32_THREADS_INIT=0
+  -DCMAKE_USE_PTHREADS_INIT=1
+  -DTHREADS_PREFER_PTHREAD_FLAG=ON
+  -DBUILD_ARCH=armv8-a
+  -DUSE_INTGEMM=off
+  -DUSE_SIMDE=on
+  -DUSE_RUY=on
+  -DUSE_ONNX_SGEMM=on # For the time being.
+  -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see.
+)
+# -L additionally lists the finally-configured variables.
+cmake -L \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \
+  -DCMAKE_MODULE_PATH=$CUSTOM_MODULE_PATH \
+  -DANDROID_TOOLCHAIN=clang \
+  -DANDROID_ABI=$ABI \
+  -DANDROID_PLATFORM=$ANDROID_PLATFORM \
+  -DANDROID_NATIVE_API_LEVEL=$MINSDK_VERSION \
+  -DANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.8 \
+  -DANDROID_STL=c++_static \
+  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+  "${OTHER_ANDROID_ARGS[@]}" "${OTHER_MARIAN_ARGS[@]}" \
+  ..
diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt
index f335c218a..2fbe69b14 100644
--- a/src/3rd_party/CMakeLists.txt
+++ b/src/3rd_party/CMakeLists.txt
@@ -5,17 +5,35 @@ add_subdirectory(./yaml-cpp)
 if(NOT USE_WASM_COMPATIBLE_SOURCE)
   add_subdirectory(./SQLiteCpp)
   add_subdirectory(./zlib)
+  add_subdirectory(./faiss)
   include_directories(./faiss)
 endif()

-add_subdirectory(./pathie-cpp)
-set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
-add_subdirectory(./intgemm)
+add_subdirectory(./pathie-cpp)

-if(USE_WASM_COMPATIBLE_BLAS)
+if(USE_INTGEMM)
+  set(INTGEMM_DONT_BUILD_TESTS ON CACHE BOOL "Disable intgemm tests")
+  add_subdirectory(./intgemm)
+endif(USE_INTGEMM)
+
+if(USE_RUY)
+  set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL " " FORCE)
+  set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL " " FORCE)
+  set(CPUINFO_BUILD_PKG_CONFIG OFF CACHE BOOL " " FORCE)
+  set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL " " FORCE)
+  set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL " " FORCE)
+  add_subdirectory(ruy/third_party/cpuinfo EXCLUDE_FROM_ALL)
+  add_subdirectory(ruy EXCLUDE_FROM_ALL)
+endif(USE_RUY)
+
+if(USE_SIMDE)
+  include_directories(./simde-no-tests)
+endif(USE_SIMDE)
+
+if(USE_ONNX_SGEMM)
   add_subdirectory(./onnxjs)
-endif(USE_WASM_COMPATIBLE_BLAS)
+endif(USE_ONNX_SGEMM)

 if(USE_FBGEMM)
   # @TODO: find out if this is somehow harmful.
  # This is suppressing CMake warnings for CMAKE_SUPPRESS_DEVELOPER_WARNINGS
diff --git a/src/3rd_party/faiss/VectorTransform.cpp b/src/3rd_party/faiss/VectorTransform.cpp
index 103b0910e..a26c2b4d3 100644
--- a/src/3rd_party/faiss/VectorTransform.cpp
+++ b/src/3rd_party/faiss/VectorTransform.cpp
@@ -132,7 +132,18 @@ const float *fvecs_maybe_subsample(
     return x_subset;
 }

-#if 1 // def __SSE__
+// Reference standard C++ implementation, used where SSE is unavailable.
+float fvec_norm_L2sqr_ref(const float *x, size_t d)
+{
+    size_t i;
+    double res = 0;
+    for (i = 0; i < d; i++)
+        res += x[i] * x[i];
+    return res;
+}
+
+
+
+#ifdef __SSE__

 // reads 0 <= d < 4 floats as __m128
 static inline __m128 masked_read(int d, const float *x)
 {
diff --git a/src/3rd_party/ruy b/src/3rd_party/ruy
new file mode 160000
index 000000000..2d950b3bf
--- /dev/null
+++ b/src/3rd_party/ruy
@@ -0,0 +1 @@
+Subproject commit 2d950b3bfa7ebfbe7a97ecb44b1cc4da5ac1d6f0
diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece
index 3ffdc0065..c307b874d 160000
--- a/src/3rd_party/sentencepiece
+++ b/src/3rd_party/sentencepiece
@@ -1 +1 @@
-Subproject commit 3ffdc0065a03cadd9d0e5e123aaf9b6ea7ffb05d
+Subproject commit c307b874deb5ea896db8f93506e173353e66d4d3
diff --git a/src/3rd_party/simde-no-tests b/src/3rd_party/simde-no-tests
new file mode 160000
index 000000000..9af03cd0f
--- /dev/null
+++ b/src/3rd_party/simde-no-tests
@@ -0,0 +1 @@
+Subproject commit 9af03cd0f30efae1beb94ef31430dc0370b98b0c
diff --git a/src/3rd_party/sse_mathfun.h b/src/3rd_party/sse_mathfun.h
index 91155cac3..89ca1d3ed 100644
--- a/src/3rd_party/sse_mathfun.h
+++ b/src/3rd_party/sse_mathfun.h
@@ -29,7 +29,13 @@
   (this is the zlib license)
 */

+#ifndef USE_SIMDE
 #include <xmmintrin.h>
+#else
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "x86/sse.h"
+#endif

 /* yes I know, the top of this file is quite ugly */

@@ -712,3 +718,6 @@ static inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) {
   *c = _mm_xor_ps(xmm2, sign_bit_cos);
 }

+#ifdef USE_SIMDE
+#pragma GCC diagnostic pop
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9123e2324..5845a6710 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -7,6 +7,8 @@ include_directories(3rd_party/sentencepiece)
 include_directories(3rd_party/sentencepiece/third_party/protobuf-lite)
 include_directories(3rd_party/fbgemm/include)
 include_directories(3rd_party/intgemm)
+include_directories(3rd_party/ruy)
+include_directories(3rd_party/simde-no-tests)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party/intgemm)
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party)
 include_directories(${CMAKE_BINARY_DIR}/local/include)
@@ -99,12 +101,16 @@ set(MARIAN_SOURCES
   $
 )

-if (NOT USE_WASM_COMPATIBLE_SOURCE)
+if (NOT USE_WASM_COMPATIBLE_SOURCE AND NOT ANDROID)
+  # ExceptionWithCallStack is unsupported on WASM, and unsupported on Android as well.
   list(APPEND MARIAN_SOURCES
     3rd_party/ExceptionWithCallStack.cpp
+  )
+endif()
+
+if (NOT USE_WASM_COMPATIBLE_SOURCE)
+  list(APPEND MARIAN_SOURCES
     data/corpus_sqlite.cpp
-    tensors/cpu/fbgemm/packed_gemm.cpp
     layers/lsh.cpp
     optimizers/quantizer.cpp
@@ -122,6 +128,11 @@ if (NOT USE_WASM_COMPATIBLE_SOURCE)
     $
     $
   )
+  if(USE_FBGEMM)
+    list(APPEND MARIAN_SOURCES
+      tensors/cpu/fbgemm/packed_gemm.cpp
+    )
+  endif(USE_FBGEMM)
 endif()

 add_library(marian STATIC ${MARIAN_SOURCES})
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 9e90bbb59..3bcc6518e 100755
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -15,16 +15,6 @@
 #include
 #include

-#if MKL_FOUND
-#include <mkl.h>
-#elif BLAS_FOUND
-  #if WASM_COMPATIBLE_BLAS
-    #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
-  #else
-    #include <cblas.h>
-  #endif // WASM_COMPATIBLE_BLAS
-#endif
-
 namespace marian {

 // TODO: Move this to CLIWrapper and allow to mark options as paths in the same place they are
diff --git a/src/common/logging.cpp b/src/common/logging.cpp
index 999b97b42..51d80510a 100644
--- a/src/common/logging.cpp
+++ b/src/common/logging.cpp
@@ -128,7 +128,7 @@ static void setErrorHandlers() {
   std::set_terminate(unhandledException);
 #ifdef __unix__
   // catch segfaults
-  struct sigaction sa = { {0} };
+  struct sigaction sa = { 0 };
   sigemptyset(&sa.sa_mask);
   sa.sa_flags = SA_SIGINFO;
   sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };
@@ -149,8 +149,8 @@ void switchtoMultinodeLogging(std::string nodeIdStr) {
 namespace marian {

 std::string noinline getCallStack(size_t skipLevels) {
-  #ifdef WASM_COMPATIBLE_SOURCE
-  return "Callstacks not supported in WASM builds currently";
+  #if defined(WASM_COMPATIBLE_SOURCE) || defined(__ANDROID__)
+  return "Callstacks not supported in WASM or Android builds currently";
   #else
   return ::Microsoft::MSR::CNTK::DebugUtil::GetCallStack(skipLevels + 2, /*makeFunctionNamesStandOut=*/true);
   #endif
diff --git a/src/common/types.h b/src/common/types.h
index 575e77120..fb6aab27a 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -17,7 +17,13 @@
 #include

 #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
+
+#ifndef USE_SIMDE
 #include <immintrin.h>
+#else
+#include "x86/avx2.h"
+#endif
+
 #endif

 #ifdef __CUDACC__ // nvcc is compiling this code
diff --git a/src/layers/lsh.cpp b/src/layers/lsh.cpp
index a91778ed5..19b040ab9 100644
--- a/src/layers/lsh.cpp
+++ b/src/layers/lsh.cpp
@@ -3,7 +3,7 @@
 #include "tensors/cpu/prod_blas.h"

 #if BLAS_FOUND
-#include "3rd_party/faiss/IndexLSH.h"
+#include "faiss/IndexLSH.h"
 #endif

 namespace marian {
@@ -127,4 +127,4 @@ Expr LSH::affine(Expr idx, Expr input, Expr W, Expr b) {
 }
 #endif

-} // namespace marian
\ No newline at end of file
+} // namespace marian
diff --git a/src/tensors/cpu/expression_graph_packable.h b/src/tensors/cpu/expression_graph_packable.h
index d93719d8e..4af120e4d 100644
--- a/src/tensors/cpu/expression_graph_packable.h
+++ b/src/tensors/cpu/expression_graph_packable.h
@@ -209,7 +209,7 @@ class ExpressionGraphPackable : public ExpressionGraph {
       if (gemmElementType == Type::intgemm8) {
 #if defined(WASM)
         ABORT("Int8::PrepareA is not implemented for wasm.");
-#else
+#elif defined(USE_INTGEMM)
         float quantMult = 127.0f / intgemm::MaxAbsolute(val->data(), val->data() + val->shape().elements());
         intgemm::Int8::PrepareA(tmp->data(), /*input*/
                                 paramMat->data(), /*output*/
@@ -218,11 +218,13 @@
                                 cols(val));
         //Put the quantMult at the back of the tensor
         *(reinterpret_cast<float *>(paramMat->data() + val->shape().elements())) = quantMult;
+#else
+        ABORT("Int8::PrepareA not implemented yet for ruy");
 #endif
       } else {
 #if defined(WASM)
         ABORT("Int16::PrepareA is not implemented for wasm.");
-#else
+#elif defined(USE_INTGEMM)
         float quantMult = 1024.0f;
         intgemm::Int16::PrepareA(tmp->data(), /*input*/
                                  paramMat->data(), /*output*/
@@ -231,6 +233,8 @@
                                  cols(val));
         //Put the quantMult at the back of the tensor
         *(reinterpret_cast<float *>(paramMat->data() + val->shape().elements())) = quantMult;
+#else
+        ABORT("Int16::PrepareA not implemented yet for ruy");
 #endif
       }
diff --git a/src/tensors/cpu/integer_common.cpp b/src/tensors/cpu/integer_common.cpp
index 21e7254fa..a941c0d02 100644
--- a/src/tensors/cpu/integer_common.cpp
+++ b/src/tensors/cpu/integer_common.cpp
@@ -1,5 +1,18 @@
 #include "integer_common.h"

+#ifndef USE_SIMDE
+#include <emmintrin.h>
+#include <immintrin.h>
+#include <tmmintrin.h>
+#include <xmmintrin.h>
+#else // USE_SIMDE
+// https://wiki.debian.org/SIMDEverywhere#Approach
+#include "x86/sse2.h"
+#include "x86/avx2.h"
+#include "x86/ssse3.h"
+#include "x86/sse.h"
+#endif
+
 namespace marian {
 namespace cpu {
 namespace integer {
@@ -39,4 +52,4 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) {

 } //integer
 } //cpu
-} //marian
\ No newline at end of file
+} //marian
diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index 97ca79c12..80aa31d24 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -4,15 +4,16 @@
 #include "tensors/tensor_operators.h"
 #include "tensors/cpu/aligned.h"
 #include "common/io_item.h"
+#ifdef USE_INTGEMM
 #include "3rd_party/intgemm/intgemm/intgemm.h"
+#else // USE_INTGEMM
+#include <ruy/ruy.h>
+#include "ruy_adapter.h"
+#endif // USE_INTGEMM
 #if defined(WASM)
 #include "wasm_intgemm_interface.h"
 #endif

-#include <emmintrin.h>
-#include <immintrin.h>
-#include <tmmintrin.h>
-#include <xmmintrin.h>
 #include
 #include

@@ -27,6 +28,11 @@ inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }
 inline int cols(Shape& shape) { return shape[-1]; }
 inline int rows(Shape& shape) { return shape.elements() / cols(shape); }

+// This operates on floats after processing so doesn't care about int8_t vs int16_t.
+void AddBias(marian::Tensor C, const marian::Tensor Bias);
+
+#ifdef USE_INTGEMM
+
 template <Type type> struct intgemm_;

 template <> struct intgemm_<Type::intgemm8> {using width = intgemm::Int8;
                                              using type = int8_t;
                                              constexpr static const Type intgemmType = Type::intgemm8;};
 template <> struct intgemm_<Type::intgemm16> {using width = intgemm::Int16;
                                               using type = int16_t;
                                               constexpr static const Type intgemmType = Type::intgemm16;};
-// This operates on floats after processing so doesn't care about int8_t vs int16_t.
-void AddBias(marian::Tensor C, const marian::Tensor Bias);
+
+
+#else // USE_INTGEMM
+
+template <Type type> struct intgemm_;
+template <> struct intgemm_<Type::intgemm8> {using width = IntgemmViaRuy::Int8;
+                                             using type = IntgemmViaRuy::Int8::Type;
+                                             constexpr static const Type intgemmType = Type::intgemm8;};
+template <> struct intgemm_<Type::intgemm16> {using width = IntgemmViaRuy::Int16;
+                                              using type = IntgemmViaRuy::Int16::Type;
+                                              constexpr static const Type intgemmType = Type::intgemm16;};
+
+#endif // USE_INTGEMM

 // For loading architecture agnostic models. We do PrepareAndTranspose, because we already transposed
 // in our binary format.
 // Then we copy the quantizationMultiplier information at the end.
@@ -86,7 +103,7 @@ void prepareAndTransposeB(io::Item& item, const char * input) {
         //Copy the quantMult
         float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const typename intgemm_<vtype>::type *>(input) + item.shape.elements()));
         *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
-    #else
+    #else // COMPILE_CPU
         ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
     #endif
 }
@@ -106,4 +123,5 @@ void unquantizeWemb(io::Item& item, const char * input) {

 } //integer
 } //cpu
-} //marian
\ No newline at end of file
+} //marian
+
diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h
index 7b965d296..4dcfc83f3 100644
--- a/src/tensors/cpu/intgemm_interface.h
+++ b/src/tensors/cpu/intgemm_interface.h
@@ -42,7 +42,7 @@ bool shifted_;
                    rows(child(0)->val()),
                    cols(child(0)->val()),
                    val_->data() /*output*/);
-    #else
+    #elif defined(USE_INTGEMM)
    typedef typename intgemm_<vtype>::type Integer;
    if (!shifted_) {
      intgemm_<vtype>::width::PrepareA(child(0)->val()->data(), /*input*/
@@ -57,6 +57,14 @@ bool shifted_;
                    rows(child(0)->val()),
                    cols(child(0)->val()));
    }
+    #else
+    // Same as the unshifted case above; there is no shifted codepath on ARM.
+    typedef typename intgemm_<vtype>::type Integer;
+    intgemm_<vtype>::width::PrepareA(child(0)->val()->data(), /*input*/
+                   val_->data(), /*output*/
+                   *child(1)->val()->data(), /*Quant Mult*/
+                   rows(child(0)->val()),
+                   cols(child(0)->val()));
    #endif
  }};
  #else
@@ -258,8 +266,8 @@ struct QuantMultNodeOp : public UnaryNodeOp {
#pragma warning(push)
#pragma warning(disable: 4127) //VSCODE thinks line 222 is constant conditional expression, which it is only after the template resolution, not before.
  NodeOps forwardOps() override {
-#ifdef COMPILE_CPU
-    return {NodeOp(
+    return {[=](){
+      #ifdef COMPILE_CPU
      if (vtype == Type::int16) {
        *val_->data() = 1024.0f;
      } else if (child(0)->type() == "intgemmSelectColumnsB") {
@@ -269,17 +277,21 @@ struct QuantMultNodeOp : public UnaryNodeOp {
        *val_->data() = *(reinterpret_cast<float *>(reinterpret_cast<typename intgemm_<vtype>::type *>(child(0)->val()->data()) + child(0)->val()->shape().elements()));
      } else {
        if (child(0)->graph()->getBackend()->DumpQuantMult()) {
+          #if defined(USE_INTGEMM)
          intgemm::MeanStd meanstd = intgemm::GetVectorMeanStd(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements(), true);
          intgemm::MeanStd meanstd2 = intgemm::GetVectorMeanStd(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements());
          std::cerr << "Name: " << name() << " MeanAbs: " << meanstd.mean << " stddevAbs: " << meanstd.stddev << " Mean: " << meanstd2.mean << " stddev: "
          << meanstd2.stddev << " MaxAbs: " << intgemm::MaxAbsolute(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements()) << std::endl;
+          #endif
        }
+        #if defined(USE_INTGEMM)
        *val_->data() = 127.0f / intgemm::MaxAbsolute(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements());
+        #else
+        *val_->data() = 127.0f / IntgemmViaRuy::MaxAbsolute(child(0)->val()->data(), child(0)->val()->data() + child(0)->val()->shape().elements());
+        #endif
      }
-    )};
-#else
-    return {NodeOp()};
-#endif
+      #endif // COMPILE_CPU
+    }};
  }
#pragma warning(pop)
  NodeOps backwardOps() override {
@@ -345,9 +357,11 @@ class PrepareBiasForBNodeOp : public NaryNodeOp {
      float scale_a = *quant_mult_a->data();
      float scale_b = *quant_mult_b->data();
      int8PrepareBias((const int8_t *)b->data(), scale_a, 0.0 /*zero_point_a*/, scale_b, 0.0 /*zero_point_b*/, rows(b), cols(b), bias->data(), val_->data());
-      #else
+      #elif defined(USE_INTGEMM)
      float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on
      intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias->data(), val_->data()));
+      #else
+      IntgemmViaRuy::PrepareBias(bias->data(), val_->data(), rows(b), cols(b));
      #endif
    }
  }};
@@ -382,9 +396,13 @@ class PrepareFakeBiasForBNodeOp : public NaryNodeOp {
      float scale_a = *quant_mult_a->data();
      float scale_b = *quant_mult_b->data();
      int8PrepareBias((const int8_t *)b->data(), scale_a, 0.0 /*zero_point_a*/, scale_b, 0.0 /*zero_point_b*/, rows(b), cols(b), nullptr/*input_bias*/, val_->data());
-      #else
+      #elif defined(USE_INTGEMM)
      float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on
      intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data()));
+      #else
+      const float *bias = nullptr;
+      float *bias_prepared = val_->data();
+      IntgemmViaRuy::PrepareBias(bias, bias_prepared, rows(b), cols(b));
      #endif
  }};
  #else
@@ -433,7 +451,7 @@ float scalar_;
               "Int16::Multiply is not implemented for wasm.");
      ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm8,
               "Int8::Multiply is not implemented for wasm.");
    #else
+    #elif defined(USE_INTGEMM)
    typedef typename intgemm_<vtype>::type Integer;
    intgemm_<vtype>::width::Multiply(reinterpret_cast<Integer *>(child(0)->val()->data()), /*A*/
                                     reinterpret_cast<Integer *>(child(1)->val()->data()), /*B*/
                                     rows(child(0)->val()),
                                     cols(child(0)->val()),
                                     cols(child(1)->val()),
                                     intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data()));
+    #else
+    typedef typename intgemm_<vtype>::type Integer;
+    intgemm_<vtype>::width::Multiply(reinterpret_cast<Integer *>(child(0)->val()->data()), /*A*/
+                                     reinterpret_cast<Integer *>(child(1)->val()->data()), /*B*/
+                                     val_->data(), /*output*/
+                                     rows(child(0)->val()),
+                                     cols(child(0)->val()),
+                                     cols(child(1)->val()),
+                                     unquant_mult);
+
    #endif
  }};
  #else
@@ -507,7 +535,7 @@ class AffineNodeOp : public NaryNodeOp {
                  cols(child(0)->val()),
                  cols(child(1)->val()),
                  val_->data());
-    #else
+    #elif defined(USE_INTGEMM)
    typedef typename intgemm_<vtype>::type Integer;
    if (!shifted_) {
      intgemm_<vtype>::width::Multiply(reinterpret_cast<Integer *>(child(0)->val()->data()), /*A*/
@@ -524,6 +552,18 @@ class AffineNodeOp : public NaryNodeOp {
                  cols(child(1)->val()), /*child(2) is bias*/
                  intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, child(2)->val()->data(), val_->data()));
    }
+    #else
+    typedef typename intgemm_<vtype>::type Integer;
+    intgemm_<vtype>::width::Multiply(reinterpret_cast<Integer *>(child(0)->val()->data()), /*A*/
+                                     reinterpret_cast<Integer *>(child(1)->val()->data()), /*B*/
+                                     child(2)->val()->data(), /*child(2) is bias*/
+                                     val_->data(), /*output*/
+                                     rows(child(0)->val()),
+                                     cols(child(0)->val()),
+                                     cols(child(1)->val()),
+                                     unquant_mult);
+
    #endif
  }};
  #else
diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp
index 4d761cf4b..8cc030539 100755
--- a/src/tensors/cpu/prod.cpp
+++ b/src/tensors/cpu/prod.cpp
@@ -10,11 +10,9 @@
 #if MKL_FOUND
     #include <mkl.h>
 #elif BLAS_FOUND
-    #if WASM_COMPATIBLE_BLAS
-        #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
-    #else
-        #include <cblas.h>
-    #endif // WASM_COMPATIBLE_BLAS
+    #include <cblas.h>
+#elif USE_ONNX_SGEMM
+    #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
 #endif

 #include "integer_common.h"
@@ -79,7 +77,6 @@ void ProdBatchedOld(marian::Tensor C,
                     bool transB,
                     float beta,
                     float scalar) {
-#if BLAS_FOUND
  float alpha = scalar;
  size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
@@ -183,10 +180,6 @@ void ProdBatchedOld(marian::Tensor C,
                (int)ldc);
  }
#endif
-#else
-  C; A; B; transA; transB; beta; scalar;
-  ABORT("You need to compile with MKL in order to use the CPU version");
-#endif
}

void ProdBatched(marian::Tensor C,
diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h
index 1d6757927..c9dd6d7bc 100644
--- a/src/tensors/cpu/prod_blas.h
+++ b/src/tensors/cpu/prod_blas.h
@@ -1,11 +1,10 @@
+#pragma once
 #if MKL_FOUND
-#include <mkl.h>
+    #include <mkl.h>
 #elif BLAS_FOUND
-    #if WASM_COMPATIBLE_BLAS
-        #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
-    #else
     #include <cblas.h>
-    #endif // WASM_COMPATIBLE_BLAS
+#elif USE_ONNX_SGEMM
+    #include "3rd_party/onnxjs/src/wasm-ops/gemm.h"
 #endif

 inline void sgemm(bool transA,
@@ -22,9 +21,6 @@ inline void sgemm(bool transA,
                   float* c,
                   int ldc) {
 #if BLAS_FOUND
-    #if WASM_COMPATIBLE_BLAS
-        gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c);
-    #else
     cblas_sgemm(CblasRowMajor,
                 transA ? CblasTrans : CblasNoTrans,
                 transB ? CblasTrans : CblasNoTrans,
                 rows_a,
                 rows_b,
                 width,
                 alpha,
                 a,
                 lda,
                 b,
                 ldb,
                 beta,
                 c,
                 ldc);
-    #endif
+#elif USE_ONNX_SGEMM
+    gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c);
 #else
     transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy
     ABORT("Marian must be compiled with a BLAS library");
diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
new file mode 100644
index 000000000..d4bbfc002
--- /dev/null
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -0,0 +1,534 @@
+/*
+ * This file follows intgemm and is a means of retrofitting ruy into the intgemm based wiring in
+ * `intgemm_interface.h`. ruy is an inference backend used in TensorFlow and Android deployments and
+ * has an optimized ARM backend for the multiply operations required. Optimized code for quantize,
+ * unquantize, transpose are added separately to connect the multiply library to marian.
+ */
+
+#pragma once
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include "ruy/platform.h"
+#include "ruy/system_aligned_alloc.h"
+
+#if RUY_PLATFORM_NEON
+#include <arm_neon.h>
+#endif
+
+namespace marian {
+namespace cpu {
+namespace integer {
+
+using Index = unsigned int;
+
+/*
+ * An AlignedVector is similar to intgemm's aligned allocations. Defined here
+ * independently because we are ignoring intgemm path entirely on ARM.
+ */
+template <class T>
+class AlignedVector {
+public:
+  AlignedVector(size_t num_elem)
+      : size_(num_elem),
+        storage_(reinterpret_cast<T *>(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {}
+
+  T *begin() { return storage_; }
+  T *data() { return storage_; }
+  size_t size() const { return size_; }
+  size_t memSize() const { return sizeof(T) * size_; }
+
+  // Forbid copy
+  AlignedVector(const AlignedVector &) = delete;
+  AlignedVector &operator=(const AlignedVector &) = delete;
+
+  ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast<void *>(storage_)); }
+
+private:
+  size_t size_;
+  T *storage_;
+};
+
+enum class Path { kStandardCpp = 0, kNeon = 1 };
+
+#if RUY_PLATFORM_NEON
+constexpr Path kHighestPath = Path::kNeon;
+#else
+constexpr Path kHighestPath = Path::kStandardCpp;
+#endif
+
+template <Path path>
+struct Preprocess;
+
+/*
+ * Naive implementation using standard C++ functions. Not optimized using SIMD operations.
+ */
+template <>
+struct Preprocess<Path::kStandardCpp> {
+  static void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) {
+    const Index size = rows * width;
+    for(Index i = 0; i < size; i++) {
+      // Round to nearest after multiplying with scale.
+      float value = roundf(scale * input[i]);
+
+      // Since float can store bigger values, we threshold anything that's gone
+      // higher and can't fit in int8.
+      value = std::max(-127.0f, value);
+      value = std::min(127.0f, value);
+
+      // Finally a static cast.
+      output[i] = static_cast<int8_t>(value);
+    };
+  }
+
+  template <class Scalar>
+  static void transpose(const Scalar *input, Index rows, Index cols, Scalar *output) {
+    for(Index i = 0; i < rows; i++) {
+      for(Index j = 0; j < cols; j++) {
+        output[j * rows + i] = input[i * cols + j];
+      }
+    }
+  }
+
+  static void unquantizeAddBias(const int32_t *input,
+                                const float *input_bias_prepared,
+                                float unquant_multiplier,
+                                Index rows_A,
+                                Index cols_B,
+                                float *output) {
+    for(Index i = 0; i < rows_A; i++) {
+      for(Index j = 0; j < cols_B; j++) {
+        Index idx = i * cols_B + j;
+        output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j];
+      }
+    }
+  }
+
+  static void unquantize(const int32_t *input,
+                         float unquant_multiplier,
+                         Index rows_A,
+                         Index cols_B,
+                         float *output) {
+    for(Index i = 0; i < rows_A; i++) {
+      for(Index j = 0; j < cols_B; j++) {
+        Index idx = i * cols_B + j;
+        output[idx] = (input[idx] * unquant_multiplier);
+      }
+    }
+  }
+};
+
+#if RUY_PLATFORM_NEON
+
+/*
+ * Optimized path using ARM NEON SIMD intrinsics. Currently only supports int8_t.
+ * TODO: Expand support to 16-bit.
+ */
+template <>
+struct Preprocess<Path::kNeon> {
+  static void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) {
+    const float32x4_t *Input = reinterpret_cast<const float32x4_t *>(input);
+    const float32x4_t *InputEnd = reinterpret_cast<const float32x4_t *>(input + rows * width);
+
+    int8x8_t *Output = reinterpret_cast<int8x8_t *>(output);
+    while(Input != InputEnd) {
+      // Vector multiply by scalar
+      // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);
+      // VMUL.F32 q0,q0,d0[0]
+      float32x4_t scaledFloat_lo = vmulq_n_f32(*Input++, scale);
+
+      // Convert from float
+      // int32x4_t vcvtnq_s32_f32(float32x4_t a);
+      // VCVT.S32.F32 q0, q0
+      int32x4_t scaledInt_lo = vcvtnq_s32_f32(scaledFloat_lo);
+
+      // Vector saturating narrow integer
+      // int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
+      int16x4_t s16x4_lo = vqmovn_s32(scaledInt_lo);
+
+      // Vector multiply by scalar
+      // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);
+      // VMUL.F32 q0,q0,d0[0]
+      float32x4_t scaledFloat_hi = vmulq_n_f32(*Input++, scale);
+
+      // Convert from float
+      // int32x4_t vcvtnq_s32_f32(float32x4_t a);
+      // VCVT.S32.F32 q0, q0
+      int32x4_t scaledInt_hi = vcvtnq_s32_f32(scaledFloat_hi);
+
+      // Vector saturating narrow integer
+      // int16x4_t vqmovn_s32(int32x4_t a);
+      // VQMOVN.S32 d0,q0
+      int16x4_t s16x4_hi = vqmovn_s32(scaledInt_hi);
+
+      // Combine two ints.
+      // int16x8_t vcombine_s16(int16x4_t low, int16x4_t high);
+      int16x8_t s16x8 = vcombine_s16(s16x4_lo, s16x4_hi);
+
+      // Vector saturating narrow integer
+      int8x8_t s8x8 = vqmovn_s16(s16x8);
+
+      *Output = s8x8;
+      ++Output;
+    };
+  }
+
+  template <class Scalar>
+  static void transpose(const Scalar *input, Index rows, Index cols, Scalar *output) {
+    // This is a template with an abort. The specialized implementation is done
+    // below for int8_t.
+    std::abort();
+  }
+
+  // Specialization for int8_t
+  static void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) {
+    constexpr Index tile_size = 16;
+    // TODO(jerin): Enable
+    // assert(rows % tile_size == 0 && cols % tile_size == 0);
+    for(Index i = 0; i < rows; i += tile_size) {
+      for(Index j = 0; j < cols; j += tile_size) {
+        _transpose_16x16(input, i, j, rows, cols, output);
+      }
+    }
+  }
+
+  static void _transpose_16x16(const int8_t *src,
+                               Index i,
+                               Index j,
+                               Index rows,
+                               Index cols,
+                               int8_t *dst) {
+    // Implemented following the algorithm described in
+    // https://stackoverflow.com/a/29587984/4565794
+    //
+    // permute n 32-bit rows
+    // permute n 64-bit rows
+    // ...
+    // permute n simd_width/2-bit rows
+
+    // clang-format off
+
+    // Permute 8 8-bit rows.
+    // Load int8x16x2 from memory into SIMD registers, transpose as 2x2 matrices.
+
+    Index srcRowBegin = i*cols + j;
+    int8x16x2_t r0 = vtrnq_s8(vld1q_s8(&src[ 0*cols + srcRowBegin]), vld1q_s8(&src[ 1*cols + srcRowBegin]));
+    int8x16x2_t r1 = vtrnq_s8(vld1q_s8(&src[ 2*cols + srcRowBegin]), vld1q_s8(&src[ 3*cols + srcRowBegin]));
+    int8x16x2_t r2 = vtrnq_s8(vld1q_s8(&src[ 4*cols + srcRowBegin]), vld1q_s8(&src[ 5*cols + srcRowBegin]));
+    int8x16x2_t r3 = vtrnq_s8(vld1q_s8(&src[ 6*cols + srcRowBegin]), vld1q_s8(&src[ 7*cols + srcRowBegin]));
+    int8x16x2_t r4 = vtrnq_s8(vld1q_s8(&src[ 8*cols + srcRowBegin]), vld1q_s8(&src[ 9*cols + srcRowBegin]));
+    int8x16x2_t r5 = vtrnq_s8(vld1q_s8(&src[10*cols + srcRowBegin]), vld1q_s8(&src[11*cols + srcRowBegin]));
+    int8x16x2_t r6 = vtrnq_s8(vld1q_s8(&src[12*cols + srcRowBegin]), vld1q_s8(&src[13*cols + srcRowBegin]));
+    int8x16x2_t r7 = vtrnq_s8(vld1q_s8(&src[14*cols + srcRowBegin]), vld1q_s8(&src[15*cols + srcRowBegin]));
+
+
+    // Permute 8 16-bit rows.
+    // Next step is to treat the entries as int16x8x2 (via cast) and do
+    // transpose for int16, which will now leave intra-2 pairs intact while
+    // transposing inter 2-pairs into the right places.
+    int16x8x2_t t0 = vtrnq_s16(vreinterpretq_s16_s8(r0.val[0]), vreinterpretq_s16_s8(r1.val[0]));
+    int16x8x2_t t1 = vtrnq_s16(vreinterpretq_s16_s8(r2.val[0]), vreinterpretq_s16_s8(r3.val[0]));
+    int16x8x2_t t2 = vtrnq_s16(vreinterpretq_s16_s8(r4.val[0]), vreinterpretq_s16_s8(r5.val[0]));
+    int16x8x2_t t3 = vtrnq_s16(vreinterpretq_s16_s8(r6.val[0]), vreinterpretq_s16_s8(r7.val[0]));
+    int16x8x2_t t4 = vtrnq_s16(vreinterpretq_s16_s8(r0.val[1]), vreinterpretq_s16_s8(r1.val[1]));
+    int16x8x2_t t5 = vtrnq_s16(vreinterpretq_s16_s8(r2.val[1]), vreinterpretq_s16_s8(r3.val[1]));
+    int16x8x2_t t6 = vtrnq_s16(vreinterpretq_s16_s8(r4.val[1]), vreinterpretq_s16_s8(r5.val[1]));
+    int16x8x2_t t7 = vtrnq_s16(vreinterpretq_s16_s8(r6.val[1]), vreinterpretq_s16_s8(r7.val[1]));
+
+    // Permute 8 32-bit rows.
+ int32x4x2_t x0 = vtrnq_s32(vreinterpretq_s32_s16(t0.val[0]), vreinterpretq_s32_s16(t1.val[0])); + int32x4x2_t x1 = vtrnq_s32(vreinterpretq_s32_s16(t4.val[0]), vreinterpretq_s32_s16(t5.val[0])); + int32x4x2_t x2 = vtrnq_s32(vreinterpretq_s32_s16(t0.val[1]), vreinterpretq_s32_s16(t1.val[1])); + int32x4x2_t x3 = vtrnq_s32(vreinterpretq_s32_s16(t4.val[1]), vreinterpretq_s32_s16(t5.val[1])); + + int32x4x2_t x4 = vtrnq_s32(vreinterpretq_s32_s16(t2.val[0]), vreinterpretq_s32_s16(t3.val[0])); + int32x4x2_t x5 = vtrnq_s32(vreinterpretq_s32_s16(t6.val[0]), vreinterpretq_s32_s16(t7.val[0])); + int32x4x2_t x6 = vtrnq_s32(vreinterpretq_s32_s16(t2.val[1]), vreinterpretq_s32_s16(t3.val[1])); + int32x4x2_t x7 = vtrnq_s32(vreinterpretq_s32_s16(t6.val[1]), vreinterpretq_s32_s16(t7.val[1])); + + // There is no permute 8 64-bit rows available. + // Instead we follow extracting low and high and placing them into the right places. + Index dstRowBegin = j*rows + i; + vst1q_s8(&dst[ 0*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x0.val[0]), vget_low_s32(x4.val[0])))); + vst1q_s8(&dst[ 1*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x1.val[0]), vget_low_s32(x5.val[0])))); + vst1q_s8(&dst[ 2*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x2.val[0]), vget_low_s32(x6.val[0])))); + vst1q_s8(&dst[ 3*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x3.val[0]), vget_low_s32(x7.val[0])))); + vst1q_s8(&dst[ 4*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x0.val[1]), vget_low_s32(x4.val[1])))); + vst1q_s8(&dst[ 5*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x1.val[1]), vget_low_s32(x5.val[1])))); + vst1q_s8(&dst[ 6*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x2.val[1]), vget_low_s32(x6.val[1])))); + vst1q_s8(&dst[ 7*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32( vget_low_s32(x3.val[1]), vget_low_s32(x7.val[1])))); + + vst1q_s8(&dst[ 8*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x0.val[0]), vget_high_s32(x4.val[0])))); + vst1q_s8(&dst[ 9*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x1.val[0]), vget_high_s32(x5.val[0])))); + vst1q_s8(&dst[10*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[0]), vget_high_s32(x6.val[0])))); + vst1q_s8(&dst[11*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[0]), vget_high_s32(x7.val[0])))); + vst1q_s8(&dst[12*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x0.val[1]), vget_high_s32(x4.val[1])))); + vst1q_s8(&dst[13*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x1.val[1]), vget_high_s32(x5.val[1])))); + vst1q_s8(&dst[14*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[1]), vget_high_s32(x6.val[1])))); + vst1q_s8(&dst[15*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[1]), vget_high_s32(x7.val[1])))); + + // clang-format on + } + + static void unquantizeAddBias(const int32_t *input, + const float *input_bias_prepared, + float unquant_multiplier, + Index rows_A, + Index cols_B, + float *output) { + // Set all registers in lane from same scalar value. 
+    float32x4_t multiplier = vdupq_n_f32(unquant_multiplier);
+    const int32x4_t *Input = reinterpret_cast<const int32x4_t *>(input);
+    const int32x4_t *InputEnd = reinterpret_cast<const int32x4_t *>(input + rows_A * cols_B);
+    float32x4_t *Output = reinterpret_cast<float32x4_t *>(output);
+
+    while(Input != InputEnd) {
+      // Bias cycles every column for addition.
+      const float32x4_t *Bias = reinterpret_cast<const float32x4_t *>(input_bias_prepared);
+
+      // RowEnd needs to be determined to end the inner while loop below.
+      const int32x4_t *RowEnd
+          = reinterpret_cast<const int32x4_t *>(reinterpret_cast<const int32_t *>(Input) + cols_B);
+
+      while(Input != RowEnd) {
+        // Operation happening for 4-elements together:
+        // output = [int32_t]input * [float]quant_mult + [float]bias;
+        float32x4_t floatInput = vcvtq_f32_s32(*Input++);
+        float32x4_t unquantized = vmulq_f32(floatInput, multiplier);
+        *Output++ = vaddq_f32(unquantized, *Bias++);
+      }
+    }
+  }
+
+  static void unquantize(const int32_t *input,
+                         float unquant_multiplier,
+                         Index rows_A,
+                         Index cols_B,
+                         float *output) {
+    // Set all registers in lane from same scalar value.
+    float32x4_t multiplier = vdupq_n_f32(unquant_multiplier);
+    const int32x4_t *Input = reinterpret_cast<const int32x4_t *>(input);
+    const int32x4_t *InputEnd = reinterpret_cast<const int32x4_t *>(input + rows_A * cols_B);
+    float32x4_t *Output = reinterpret_cast<float32x4_t *>(output);
+
+    while(Input != InputEnd) {
+      // RowEnd needs to be determined to end the inner while loop below.
+      const int32x4_t *RowEnd
+          = reinterpret_cast<const int32x4_t *>(reinterpret_cast<const int32_t *>(Input) + cols_B);
+
+      while(Input != RowEnd) {
+        // Operation happening for 4-elements together:
+        // output = [int32_t]input * [float]quant_mult;
+        float32x4_t floatInput = vcvtq_f32_s32(*Input++);
+        float32x4_t unquantized = vmulq_f32(floatInput, multiplier);
+        *Output++ = unquantized;
+      }
+    }
+  }
+};
+
+#endif
+
+/*
+ * The following nomenclature comes from intgemm. The current state of code is to keep the
+ * intgemm_interface.h diff minimal. There are possibly better abstractions.
+ */
+struct IntgemmViaRuy {
+  // Convert compile time errors into run-time ABORTs. This allows bringing in only int8_t and
+  // select functions that are required to create a path which will run while not achieving parity
+  // with intgemm.
+  template <class T>
+  struct IntBase {
+    using Type = T;
+    static void Quantize(const float *, Type *, float, Index) { ABORT("Quantize unsupported"); }
+
+    static void PrepareA(const float *input,
+                         Type *output,
+                         float quant_mult,
+                         Index rows,
+                         Index cols) {
+      ABORT("PrepareA Unsupported");
+    }
+
+    static void PrepareB(const float *, Type *, float, Index, Index) {
+      ABORT("PrepareB Unsupported");
+    }
+    static void PrepareBQuantizedTransposed(const Type *, Type *, Index, Index) {
+      ABORT("PrepareBQuantizedTransposed Unsupported");
+    }
+    static void PrepareBTransposed(const float *, Type *, float, Index, Index) {
+      ABORT("PrepareBTransposed Unsupported");
+    }
+    static void SelectColumnsB(const Type *, Type *, Index, const Index *, const Index *) {
+      ABORT("SelectColumnsB Unsupported");
+    }
+
+    static void
+    Multiply(const Type *, const Type *, const float *, const float *, Index, Index, Index, float) {
+      ABORT("Multiply (A*B + bias) Unsupported");
+    }
+
+    static void Multiply(const Type *, const Type *, const float *, Index, Index, Index, float) {
+      ABORT("Multiply (A*B) Unsupported");
+    }
+  };
+
+  // Intgemm nomenclature expects Int8. Missing functions are ABORTs.
+  struct Int8 : IntBase<int8_t> {
+    using Type = int8_t;
+    static void PrepareBQuantizedTransposed(const Type *input,
+                                            Type *output,
+                                            Index rows,
+                                            Index cols) {
+      std::memcpy(output, input, /*count=*/sizeof(Type) * (rows * cols));
+    }
+
+    static void PrepareBTransposed(const float *input,
+                                   Type *output,
+                                   float quant_mult,
+                                   Index rows,
+                                   Index cols) {
+      Preprocess<kHighestPath>::quantize(input, output, quant_mult, rows, cols);
+    }
+
+    static void PrepareA(const float *input,
+                         int8_t *output,
+                         float quant_mult,
+                         Index rows,
+                         Index cols) {
+      Preprocess<kHighestPath>::quantize(input, output, quant_mult, rows, cols);
+    }
+
+    static void SelectColumnsB(const Type *input,
+                               Type *output,
+                               Index width,
+                               const Index *cols,
+                               const Index *cols_end) {
+      // B_prepared is expected to be col-major, for our implementation via ruy. If
+      // col-major we can memcpy the respective column entries as they're
+      // sequential. There are width=rows entries.
+      Index num_cols = static_cast<Index>(std::distance(cols, cols_end));
+      for(Index c = 0; c < num_cols; ++c) {
+        std::memcpy(&(output[c * width]), &(input[cols[c] * width]), width);
+      }
+    }
+
+    // We don't have callback and no-op capability here yet. Multiply is kept similar to the
+    // Mozilla specification and there are overloads with and without bias to avoid an if inside.
+    // This method corresponds to the one with bias.
+    // output = A*B + bias
+    static void Multiply(const Type *input_A_prepared,
+                         const Type *input_B_prepared,
+                         const float *bias_prepared,
+                         float *output,
+                         Index rows_A,
+                         Index width,
+                         Index cols_B,
+                         float unquant_multiplier) {
+      // It is expected that somehow we have managed to call all prepare by the time
+      // we are here, with inputs (prepared) in int8_t. All that's left to do is use
+      // ruy for multiply and then start with the reverse ops to get to fp32.
+
+      // Use ruy to multiply.
+      // The following is adapted from
+      // https://github.com/google/ruy/blob/878283640de7946a43053e8ebf4f15114fbc9156/example/example.cc#L129-L152
+
+      ruy::Context context;
+      ruy::Matrix<std::int8_t> lhs;
+      ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout());
+      lhs.set_data(input_A_prepared);
+
+      // PRINT_MATRIX_DEBUG(input_A_prepared, rows_A, width, Order::RowMajor);
+
+      ruy::Matrix<std::int8_t> rhs;
+      ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout());
+      rhs.set_data(input_B_prepared);
+
+      // PRINT_MATRIX_DEBUG(input_B_prepared, width, cols_B, Order::ColMajor);
+
+      ruy::Matrix<std::int32_t> dst;
+      ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout());
+
+      AlignedVector<std::int32_t> dst_data(rows_A * cols_B);
+      std::int32_t *dest_ptr = dst_data.data();
+
+      dst.set_data(dest_ptr);
+
+      // When Dst is int32, mul_params is unused.
+      ruy::MulParams<std::int32_t, std::int32_t> mul_params;
+      ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+      // Unquantizes, then adds bias in a single statement on the output.
+      // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B);
+      Preprocess<kHighestPath>::unquantizeAddBias(
+          dest_ptr, bias_prepared, unquant_multiplier, rows_A, cols_B, output);
+    }
+
+    // output = A*B (notice no bias).
+    static void Multiply(const Type *input_A_prepared,
+                         const Type *input_B_prepared,
+                         float *output,
+                         Index rows_A,
+                         Index width,
+                         Index cols_B,
+                         float unquant_multiplier) {
+      // It is expected that somehow we have managed to call all prepare by the time
+      // we are here, with inputs (prepared) in int8_t. All that's left to do is use
+      // ruy for multiply and then start with the reverse ops to get to fp32.
+
+      // Use ruy to multiply.
+      // The following is adapted from
+      // https://github.com/google/ruy/blob/878283640de7946a43053e8ebf4f15114fbc9156/example/example.cc#L129-L152
+
+      ruy::Context context;
+      ruy::Matrix<std::int8_t> lhs;
+      ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout());
+      lhs.set_data(input_A_prepared);
+
+      ruy::Matrix<std::int8_t> rhs;
+      ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout());
+      rhs.set_data(input_B_prepared);
+
+      ruy::Matrix<std::int32_t> dst;
+      ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout());
+
+      AlignedVector<std::int32_t> dst_data(rows_A * cols_B);
+      std::int32_t *dest_ptr = dst_data.data();
+
+      dst.set_data(dest_ptr);
+
+      // When Dst is int32, mul_params is unused.
+      ruy::MulParams<std::int32_t, std::int32_t> mul_params;
+      ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+      // Unquantizes, then adds bias in a single statement on the output.
+      // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B);
+      Preprocess<kHighestPath>::unquantize(dest_ptr, unquant_multiplier, rows_A, cols_B, output);
+    }
+  };
+
+  // Int16 support is currently missing.
+  struct Int16 : IntBase<int16_t> {
+    using Type = int16_t;
+  };
+
+  template <class T>
+  static T MaxAbsolute(const T *begin, const T *end) {
+    T result = 0;
+    for(auto p = begin; p < end; ++p) {
+      result = std::max(result, std::abs(*p));
+    }
+    return result;
+  }
+
+  static void PrepareBias(const float *input, float *output, Index rows, Index cols) {
+    assert(input != nullptr && output != nullptr);
+    std::memcpy(output, input, /*count=*/sizeof(float) * (1 * cols));
+  }
+};
+
+} // namespace integer
+} // namespace cpu
+} // namespace marian
\ No newline at end of file

From 2ac7cbc95cd3213797490b6d712385417fa66c32 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Wed, 9 Mar 2022 20:32:18 +0000
Subject: [PATCH 02/67] Fix sentencepiece submodule mixup

---
 src/3rd_party/sentencepiece | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece
index c307b874d..3ffdc0065 160000
--- a/src/3rd_party/sentencepiece
+++ b/src/3rd_party/sentencepiece
@@ -1 +1 @@
-Subproject commit c307b874deb5ea896db8f93506e173353e66d4d3
+Subproject commit 3ffdc0065a03cadd9d0e5e123aaf9b6ea7ffb05d

From 96749735709d9837c9cf49065010fd0f95f1da2c Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Sun, 10 Apr 2022 21:47:34 +0000
Subject: [PATCH 03/67] [sentencepiece] android cmake additional libs

---
 src/3rd_party/sentencepiece | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/3rd_party/sentencepiece b/src/3rd_party/sentencepiece
index 3ffdc0065..7f669c3f8 160000
--- a/src/3rd_party/sentencepiece
+++ b/src/3rd_party/sentencepiece
@@ -1 +1 @@
-Subproject commit 3ffdc0065a03cadd9d0e5e123aaf9b6ea7ffb05d
+Subproject commit 7f669c3f8f5fcc288838f3beba88a04533824f73

From f3e78185f7c8073cc5db7ea14f555c46ad63e344 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Sun, 10 Apr 2022 21:48:02 +0000
Subject: [PATCH 04/67] Remove separately added patch in favour of submodule update

---
 .github/workflows/arm.yml             |  4 ----
 patches/01-spm-protobuf-android.patch | 14 --------------
 2 files changed, 18 deletions(-)
 delete mode 100644 patches/01-spm-protobuf-android.patch

diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml
index 4cab55c9e..e10d29d6f 100644
--- a/.github/workflows/arm.yml
+++ b/.github/workflows/arm.yml
@@ -68,10 +68,6 @@ jobs:
           ccache -s # Print current cache stats
           ccache -z # Zero cache entry
 
-      - name: Apply patch sentencepiece for android
-        run: |-
-          patch -p1 < 
patches/01-spm-protobuf-android.patch
-
       - name: Generate buildfiles for marian on android via cmake
         run: |-
           mkdir -p build

diff --git a/patches/01-spm-protobuf-android.patch b/patches/01-spm-protobuf-android.patch
deleted file mode 100644
index 6eadb5454..000000000
--- a/patches/01-spm-protobuf-android.patch
+++ /dev/null
@@ -1,14 +0,0 @@
-diff --git a/src/3rd_party/sentencepiece/src/CMakeLists.txt b/src/3rd_party/sentencepiece/src/CMakeLists.txt
-index 0ba6407..276caec 100644
---- a/ src/3rd_party/sentencepiece/src/CMakeLists.txt
-+++ b/src/3rd_party/sentencepiece/src/CMakeLists.txt
-@@ -159,6 +159,9 @@ set(SPM_TEST_SRCS
- find_package(Threads REQUIRED)
- 
- set(SPM_LIBS ${PROTOBUF_LITE_LIBRARY} Threads::Threads)
-+if(ANDROID)
-+  set(SPM_LIBS ${SPM_LIBS} android log)
-+endif(ANDROID)
- 
- if (SPM_ENABLE_NFKC_COMPILE)
-   find_package(ICU 4.4 COMPONENTS i18n data uc REQUIRED)

From 5250b9e81ebbab8974e7c9834c1e0457bf92116c Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Sun, 10 Apr 2022 21:53:43 +0000
Subject: [PATCH 05/67] Remove trailing newline in integer_common.h to prettify diff

---
 src/tensors/cpu/integer_common.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index 80aa31d24..67ee1e94f 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -124,4 +124,3 @@ void unquantizeWemb(io::Item& item, const char * input) {
 } //integer
 } //cpu
 } //marian
-

From 26d3ba25ab7aa52364918ec94d748229fdb1d3d4 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Sun, 10 Apr 2022 21:58:16 +0000
Subject: [PATCH 06/67] Remove trailing newline in ruy_adapter.h

---
 src/tensors/cpu/ruy_adapter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index d4bbfc002..ec03903dd 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -531,4 +531,4 @@ struct IntgemmViaRuy {
 
 } // namespace integer
 } // namespace cpu
-} // namespace marian
\ No newline at end of file
+} // namespace marian

From b271b70bfe7337a4cacca789cc657dc020968cc2 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Thu, 14 Apr 2022 13:35:31 +0000
Subject: [PATCH 07/67] In-place multiply without malloc by reinterpret_cast

---
 src/tensors/cpu/ruy_adapter.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index ec03903dd..725d1aeff 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -494,9 +494,7 @@ struct IntgemmViaRuy {
       ruy::Matrix<std::int32_t> dst;
       ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout());
 
-      AlignedVector<std::int32_t> dst_data(rows_A * cols_B);
-      std::int32_t *dest_ptr = dst_data.data();
-
+      std::int32_t *dest_ptr = reinterpret_cast<std::int32_t *>(output);
       dst.set_data(dest_ptr);
 
       // When Dst is int32, mul_params is unused.
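The in-place multiply introduced in PATCH 07 above works because ruy's int32 accumulator and an fp32 output element have the same size and alignment, so the output buffer can double as scratch space: ruy writes int32 values into it, and the unquantize pass then converts each slot to float in place. A minimal standalone sketch of that pattern, not part of the patch and with illustrative names only:

    #include <cstdint>
    #include <cstddef>

    // Unquantize in place: `output` currently holds int32 GEMM results written
    // by the integer GEMM; each slot is read as int32 and then overwritten with
    // the unquantized float. Assumes sizeof(std::int32_t) == sizeof(float) and
    // tolerates the same type-punning caveat as the reinterpret_cast in the patch.
    inline void unquantizeInPlace(float *output, std::size_t size, float unquant_multiplier) {
      const std::int32_t *quantized = reinterpret_cast<const std::int32_t *>(output);
      for(std::size_t i = 0; i < size; ++i) {
        const std::int32_t q = quantized[i];  // read before the slot is overwritten
        output[i] = static_cast<float>(q) * unquant_multiplier;
      }
    }

Reading each slot before writing it keeps the loop correct even though input and output alias; the trade-off is one fewer allocation per Multiply at the cost of strict-aliasing purity.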
From efa5a854df77f4da3a85284cc979ad1dc87c65fb Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Thu, 14 Apr 2022 14:44:17 +0000
Subject: [PATCH 08/67] Documentation for the stdcpp/NEON paths created

---
 src/tensors/cpu/ruy_adapter.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index 725d1aeff..490c19b4d 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -49,7 +49,16 @@ class AlignedVector {
   T *storage_;
 };
 
-enum class Path { kStandardCpp = 0, kNeon = 1 };
+// The following partitions a pure C++ slow implementation and a faster SIMD implementation using
+// NEON intrinsics on ARM hardware. Ruy already has such a routing, but we add some preprocessing
+// and postprocessing functions (quantize, transpose, unquantize) that are outside ruy's offerings
+// and required in the fast matrix-multiplication workflow for machine-translation, that exists in
+// marian.
+
+enum class Path {
+  kStandardCpp = 0,  // Pure C++
+  kNeon = 1          // NEON Intrinsics (ARM)
+};
 
 #if RUY_PLATFORM_NEON
 constexpr Path kHighestPath = Path::kNeon;

From 179f239a7715291be824eb7fcbd546415a98858d Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Mon, 18 Apr 2022 08:07:29 +0000
Subject: [PATCH 09/67] Remove templated abort transpose()

---
 src/tensors/cpu/ruy_adapter.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index 490c19b4d..aaa0ac18c 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -182,13 +182,6 @@ struct Preprocess {
     };
   }
 
-  template <class Scalar>
-  static void transpose(const Scalar *input, Index rows, Index cols, Scalar *output) {
-    // This is a template with an abort. The specialized implementation is done
-    // below for int8_t.
-    std::abort();
-  }
-
   // Specialization for int8_t
   static void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) {
     constexpr Index tile_size = 16;

From 0d189c81a526ab87608b26bb6fcbb7a0d5828ba0 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Mon, 18 Apr 2022 08:14:21 +0000
Subject: [PATCH 10/67] Reinterpret at unquantize add bias as well as int32_t
 from float32_t

---
 src/tensors/cpu/ruy_adapter.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index aaa0ac18c..ee3cf9822 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -453,9 +453,7 @@ struct IntgemmViaRuy {
       ruy::Matrix<std::int32_t> dst;
       ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout());
 
-      AlignedVector<std::int32_t> dst_data(rows_A * cols_B);
-      std::int32_t *dest_ptr = dst_data.data();
-
+      std::int32_t *dest_ptr = reinterpret_cast<std::int32_t *>(output);
       dst.set_data(dest_ptr);
 
       // When Dst is int32, mul_params is unused.

From 8951261047b4ce60d6e2ec4c400ed589273b27fc Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Mon, 18 Apr 2022 18:17:03 +0000
Subject: [PATCH 11/67] Remove AlignedVector from ruy_adapter - not required
 here.

---
 src/tensors/cpu/ruy_adapter.h | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h
index ee3cf9822..3a04e475d 100644
--- a/src/tensors/cpu/ruy_adapter.h
+++ b/src/tensors/cpu/ruy_adapter.h
@@ -22,33 +22,6 @@ namespace integer {
 
 using Index = unsigned int;
 
-/*
- * An AlignedVector is similar to intgemm's aligned allocations. 
Defined here - * independently because we are ignoring intgemm path entirely on ARM. - */ -template -class AlignedVector { -public: - AlignedVector(size_t num_elem) - : size_(num_elem), - storage_(reinterpret_cast(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {} - - T *begin() { return storage_; } - T *data() { return storage_; } - size_t size() const { return size_; } - size_t memSize() const { return sizeof(T) * size_; } - - // Forbid copy - AlignedVector(const AlignedVector &) = delete; - AlignedVector &operator=(const AlignedVector &) = delete; - - ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast(storage_)); } - -private: - size_t size_; - T *storage_; -}; - // The following partitions a pure C++ slow implementation and a faster SIMD implementation using // NEON intrinsics on ARM hardware. Ruy already has such a routing, but we add some preprocessing // and postprocessing functions (quantize, transpose, unquantize) that are outside ruy's offerings From 49beb50ba1935bc2f27b20ec07804b2ba7cb09d8 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 18:26:15 +0000 Subject: [PATCH 12/67] Remove ViaRuy::PrepareBias without effect to output --- src/tensors/cpu/intgemm_interface.h | 6 ++---- src/tensors/cpu/ruy_adapter.h | 5 ----- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 194169c60..57d01081b 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -363,7 +363,7 @@ class PrepareBiasForBNodeOp : public NaryNodeOp { float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndAddBiasAndWrite(unquant_mult, bias->data(), val_->data())); #else - IntgemmViaRuy::PrepareBias(nullptr, val_->data(), rows(b), cols(b)); + ABORT("PrepareBias should not be called on ARM"); #endif } }}; @@ -402,9 +402,7 @@ class PrepareFakeBiasForBNodeOp : public NaryNodeOp { float unquant_mult = (-1)*((127.0f / *quant_mult_a->data())*(127.0f / *quant_mult_b->data()))/(127.0f); //Minus one to invert add_ps later on intgemm::Int8Shift::PrepareBias((const int8_t *)b->data(), rows(b), cols(b), intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); #else - const float *bias = nullptr; - float *bias_prepared = val_->data(); - IntgemmViaRuy::PrepareBias(bias, bias_prepared, rows(b), cols(b)); + // Not sure what's going on here. 
#endif }}; #else diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 3a04e475d..a200c331b 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -493,11 +493,6 @@ struct IntgemmViaRuy { } return result; } - - static void PrepareBias(const float *input, float *output, Index rows, Index cols) { - assert(input != nullptr && output != nullptr); - std::memcpy(output, input, /*count=*/sizeof(float) * (1 * cols)); - } }; } // namespace integer From 3cf85f703fa3bb54787631c13c8ff99aeb4e6dc0 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 18:58:00 +0000 Subject: [PATCH 13/67] If SSE4.1 found use it to avoid perf regressions even if not -march=native --- CMakeLists.txt | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97ef84269..5fc7a6dee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,14 +215,13 @@ else(MSVC) endif() # Detect support CPU instrinsics for the current platform. This will - # only by used with BUILD_ARCH=native. For overridden BUILD_ARCH we - # minimally use -msse4.1. This seems to work with MKL. + # only by used with BUILD_ARCH=native. + include(FindSSE) set(INTRINSICS "") list(APPEND INTRINSICS_NVCC) if(BUILD_ARCH STREQUAL "native") message(STATUS "Checking support for CPU intrinsics") - include(FindSSE) if(SSE2_FOUND) message(STATUS "SSE2 support found") set(INTRINSICS "${INTRINSICS} -msse2") @@ -263,7 +262,12 @@ else(MSVC) # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case) set(INTRINSICS "-mssse3 -msimd128") else() - # Not assuming we have "-msse4.1 here" + # For overridden BUILD_ARCH we minimally use -msse4.1 (if available) + if(SSE4_1_FOUND) + message(STATUS "SSE4.1 support found") + set(INTRINSICS "${INTRINSICS} -msse4.1") + list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.1) + endif(SSE4_1_FOUND) endif() if(USE_FBGEMM) From 4edc8ef62dc4d580e8553af1fe1d110854d474d1 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 19:31:37 +0000 Subject: [PATCH 14/67] Deduplicate multiply by capturing variability through callbacks Bit ugly for now, but we are headed towards better. 
--- src/tensors/cpu/intgemm_interface.h | 8 +- src/tensors/cpu/ruy_adapter.h | 220 +++++++++++++--------------- 2 files changed, 104 insertions(+), 124 deletions(-) diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 57d01081b..0612559d9 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -461,13 +461,14 @@ float scalar_; intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); #else typedef typename intgemm_::type Integer; + auto callback = marian::cpu::integer::Preprocess::UnquantizeAndWrite(unquant_mult); intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ val_->data(), /*output*/ rows(child(0)->val()), cols(child(0)->val()), cols(child(1)->val()), - unquant_mult); + callback); #endif }}; @@ -554,14 +555,15 @@ class AffineNodeOp : public NaryNodeOp { } #else typedef typename intgemm_::type Integer; + auto callback = marian::cpu::integer::Preprocess::UnquantizeAndAddBiasAndWrite(unquant_mult, + child(2)->val()->data() /*child(2) is bias*/); intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ - child(2)->val()->data(), /*child(2) is bias*/ val_->data(), /*output*/ rows(child(0)->val()), cols(child(0)->val()), cols(child(1)->val()), - unquant_mult); + callback); #endif diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index a200c331b..e539218cd 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -72,32 +72,36 @@ struct Preprocess { } } - static void unquantizeAddBias(const int32_t *input, - const float *input_bias_prepared, - float unquant_multiplier, - Index rows_A, - Index cols_B, - float *output) { - for(Index i = 0; i < rows_A; i++) { - for(Index j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j]; + struct UnquantizeAndAddBiasAndWrite { + UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) + : unquant_multiplier(unquant_multiplier), input_bias_prepared(input_bias_prepared) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + for(Index i = 0; i < rows_A; i++) { + for(Index j = 0; j < cols_B; j++) { + Index idx = i * cols_B + j; + output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j]; + } } } - } - static void unquantize(const int32_t *input, - float unquant_multiplier, - Index rows_A, - Index cols_B, - float *output) { - for(Index i = 0; i < rows_A; i++) { - for(Index j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier); + float unquant_multiplier; + const float *input_bias_prepared; + }; + + struct UnquantizeAndWrite { + UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier(unquant_multiplier) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + for(Index i = 0; i < rows_A; i++) { + for(Index j = 0; j < cols_B; j++) { + Index idx = i * cols_B + j; + output[idx] = (input[idx] * unquant_multiplier); + } } } - } + float unquant_multiplier; + }; }; #if RUY_PLATFORM_NEON @@ -245,63 +249,68 @@ struct Preprocess { // clang-format on } - static void unquantizeAddBias(const int32_t *input, - const float *input_bias_prepared, - float unquant_multiplier, - Index rows_A, - Index cols_B, - float *output) { - // Set all registers in lane 
from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); - const int32x4_t *Input = reinterpret_cast(input); - const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); - float32x4_t *Output = reinterpret_cast(output); - - while(Input != InputEnd) { - // Bias cycles every column for addition. - const float32x4_t *Bias = reinterpret_cast(input_bias_prepared); - - // InputEnd needs to be determined to end the while loop below. - const int32x4_t *RowEnd - = reinterpret_cast(reinterpret_cast(Input) + cols_B); - - while(Input != RowEnd) { - // Operation happening for 4-elements together: - // output = [int32_t]input * [float]quant_mult + [float]bias; - float32x4_t floatInput = vcvtq_f32_s32(*Input++); - float32x4_t unquantized = vmulq_f32(floatInput, multiplier); - *Output++ = vaddq_f32(unquantized, *Bias++); + struct UnquantizeAndAddBiasAndWrite { + UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) + : unquant_multiplier(unquant_multiplier), input_bias_prepared(input_bias_prepared) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // Bias cycles every column for addition. + const float32x4_t *Bias = reinterpret_cast(input_bias_prepared); + + // InputEnd needs to be determined to end the while loop below. + const int32x4_t *RowEnd = reinterpret_cast( + reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = vaddq_f32(unquantized, *Bias++); + } } } - } - static void unquantize(const int32_t *input, - float unquant_multiplier, - Index rows_A, - Index cols_B, - float *output) { - // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); - const int32x4_t *Input = reinterpret_cast(input); - const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); - float32x4_t *Output = reinterpret_cast(output); + float unquant_multiplier; + const float *input_bias_prepared; + }; - while(Input != InputEnd) { - // Bias cycles every column for addition. - - // InputEnd needs to be determined to end the while loop below. - const int32x4_t *RowEnd - = reinterpret_cast(reinterpret_cast(Input) + cols_B); - - while(Input != RowEnd) { - // Operation happening for 4-elements together: - // output = [int32_t]input * [float]quant_mult + [float]bias; - float32x4_t floatInput = vcvtq_f32_s32(*Input++); - float32x4_t unquantized = vmulq_f32(floatInput, multiplier); - *Output++ = unquantized; + struct UnquantizeAndWrite { + UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier(unquant_multiplier) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + // Set all registers in lane from same scalar value. 
+ float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // Bias cycles every column for addition. + + // InputEnd needs to be determined to end the while loop below. + const int32x4_t *RowEnd = reinterpret_cast( + reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = unquantized; + } } } - } + + float unquant_multiplier; + }; }; #endif @@ -312,8 +321,8 @@ struct Preprocess { */ struct IntgemmViaRuy { // Convert compile time errors into run-time ABORTS. This allows bringing in only int8_t and - // select functions that are required to create a path which will run while not achieving parity - // with intgemm. + // select functions that are required to create a path which will run while not achieving + // parity with intgemm. template struct IntBase { using Type = T; @@ -340,12 +349,14 @@ struct IntgemmViaRuy { ABORT("SelectColumnsB Unsupported"); } - static void - Multiply(const Type *, const Type *, const float *, const float *, Index, Index, Index, float) { - ABORT("Multiply (A*B + bias) Unsupported"); - } - - static void Multiply(const Type *, const Type *, const float *, Index, Index, Index, float) { + template + static void Multiply(const Type *A_prepared, + const Type *B_prepared, + float *output, + Index rows_A, + Index width, + Index cols_B, + Callback callback) { ABORT("Multiply (A*B) Unsupported"); } }; @@ -394,14 +405,14 @@ struct IntgemmViaRuy { // specification and there are overloads with and without bias to avoid an if inside. This // method corresponds to the one with bias. // output = A*B + bias + template static void Multiply(const Type *input_A_prepared, const Type *input_B_prepared, - const float *bias_prepared, float *output, Index rows_A, Index width, Index cols_B, - float unquant_multiplier) { + Callback callback) { // It is expected that somehow we have managed to call all prepare by the time // we are here, with inputs (prepared) in int8_t. All that's left to do is use // ruy for multiply and then start with the reverse ops to get to fp32. @@ -435,48 +446,15 @@ struct IntgemmViaRuy { // Unquantizes, then adds bias in a single statement on the output. // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B); - Preprocess::unquantizeAddBias( - dest_ptr, bias_prepared, unquant_multiplier, rows_A, cols_B, output); - } - - // output = A*B (notice no bias). - static void Multiply(const Type *input_A_prepared, - const Type *input_B_prepared, - float *output, - Index rows_A, - Index width, - Index cols_B, - float unquant_multiplier) { - // It is expected that somehow we have managed to call all prepare by the time - // we are here, with inputs (prepared) in int8_t. All that's left to do is use - // ruy for multiply and then start with the reverse ops to get to fp32. - - // Use ruy to multiply. 
- // The following is adapted from - // https://github.com/google/ruy/blob/878283640de7946a43053e8ebf4f15114fbc9156/example/example.cc#L129-L152 - - ruy::Context context; - ruy::Matrix lhs; - ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout()); - lhs.set_data(input_A_prepared); - - ruy::Matrix rhs; - ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout()); - rhs.set_data(input_B_prepared); - - ruy::Matrix dst; - ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout()); - - std::int32_t *dest_ptr = reinterpret_cast(output); - dst.set_data(dest_ptr); - - // When Dst is int32, mul_params is unused. - ruy::MulParams mul_params; - ruy::Mul(lhs, rhs, mul_params, &context, &dst); + // Preprocess::unquantizeAddBias( + // dest_ptr, bias_prepared, unquant_multiplier, rows_A, cols_B, output); // Unquantizes, then adds bias in a single statement on the output. // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B); - Preprocess::unquantize(dest_ptr, unquant_multiplier, rows_A, cols_B, output); + // Preprocess::unquantize(dest_ptr, unquant_multiplier, rows_A, cols_B, + // output); + + callback(dest_ptr, rows_A, cols_B, output); } }; From a414b60dd3c8637e5d5dd28050978aa3016052db Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 19:35:45 +0000 Subject: [PATCH 15/67] Revert "If SSE4.1 found use it to avoid perf regressions even if not -march=native" This reverts commit 3cf85f703fa3bb54787631c13c8ff99aeb4e6dc0. --- CMakeLists.txt | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fc7a6dee..97ef84269 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,13 +215,14 @@ else(MSVC) endif() # Detect support CPU instrinsics for the current platform. This will - # only by used with BUILD_ARCH=native. - include(FindSSE) + # only by used with BUILD_ARCH=native. For overridden BUILD_ARCH we + # minimally use -msse4.1. This seems to work with MKL. set(INTRINSICS "") list(APPEND INTRINSICS_NVCC) if(BUILD_ARCH STREQUAL "native") message(STATUS "Checking support for CPU intrinsics") + include(FindSSE) if(SSE2_FOUND) message(STATUS "SSE2 support found") set(INTRINSICS "${INTRINSICS} -msse2") @@ -262,12 +263,7 @@ else(MSVC) # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case) set(INTRINSICS "-mssse3 -msimd128") else() - # For overridden BUILD_ARCH we minimally use -msse4.1 (if available) - if(SSE4_1_FOUND) - message(STATUS "SSE4.1 support found") - set(INTRINSICS "${INTRINSICS} -msse4.1") - list(APPEND INTRINSICS_NVCC -Xcompiler\ -msse4.1) - endif(SSE4_1_FOUND) + # Not assuming we have "-msse4.1 here" endif() if(USE_FBGEMM) From e2069bfee29149ce38b06ba7b062daff454ac077 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 22:01:44 +0000 Subject: [PATCH 16/67] CMAKE_SYSTEM_PROCESSOR indicates x86 and native mode is not enabled, apply -msse4.1. 
--- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 97ef84269..5ff0b5da8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,7 +263,9 @@ else(MSVC) # -msse4.1 once marian can solely be compiled with intgemm ("onnxjs" will be removed in that case) set(INTRINSICS "-mssse3 -msimd128") else() - # Not assuming we have "-msse4.1 here" + if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64) + set(INTRINSICS "-msse4.1") + endif () endif() if(USE_FBGEMM) From 1b4049a4551746142d599c3e575a87e58bf24124 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 18 Apr 2022 22:20:19 +0000 Subject: [PATCH 17/67] Remove comments, now that callback is working --- src/tensors/cpu/ruy_adapter.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index e539218cd..89e856d10 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -444,16 +444,6 @@ struct IntgemmViaRuy { ruy::MulParams mul_params; ruy::Mul(lhs, rhs, mul_params, &context, &dst); - // Unquantizes, then adds bias in a single statement on the output. - // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B); - // Preprocess::unquantizeAddBias( - // dest_ptr, bias_prepared, unquant_multiplier, rows_A, cols_B, output); - - // Unquantizes, then adds bias in a single statement on the output. - // float unquant_multiplier = (1.0f * scale_output) / (scale_A * scale_B); - // Preprocess::unquantize(dest_ptr, unquant_multiplier, rows_A, cols_B, - // output); - callback(dest_ptr, rows_A, cols_B, output); } }; From e522e6cdcbcef11f48f92aef906400cc94c930e2 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 19 Apr 2022 16:21:12 +0000 Subject: [PATCH 18/67] Minimal gemmRuy There are no accompanying tests. --- CMakeLists.txt | 5 +- src/tensors/cpu/prod_blas.h | 127 +++++++++++++++++++++++++++++++++++- 2 files changed, 130 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ff0b5da8..d5b61c9a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compil option(USE_SIMDE "Enable simde to target instruction sets" OFF) option(USE_ONNX_SGEMM "Compile with wasm compatible blas" OFF) +option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) # cmake options that are dependent on USE_WASM_COMPATIBLE_SOURCE cmake option @@ -573,7 +574,8 @@ if(COMPILE_CPU) endif(USE_INTGEMM) if(USE_RUY) - set(EXT_LIBS ${EXT_LIBS} ruy) + set(EXT_LIBS ${EXT_LIBS} ruy) + add_definitions(-DUSE_RUY_SGEMM=1) endif(USE_RUY) add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU @@ -615,6 +617,7 @@ if(COMPILE_CPU) endif(BLAS_FOUND) endif(MKL_FOUND) endif(USE_ONNX_SGEMM) + endif(COMPILE_CPU) ############################################################################### diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index c9dd6d7bc..a3d11db9c 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -5,8 +5,119 @@ #include #elif USE_ONNX_SGEMM #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" +#elif USE_RUY_SGEMM + #include "ruy/ruy.h" + #include "ruy/system_aligned_alloc.h" #endif +#if USE_RUY_SGEMM + +// AlignedVector allocates aligned memory and cleans up after itself. 
RAII
+// wrapper similar to intgemm's AlignedVector.
+template <class T>
+class AlignedVector {
+public:
+  AlignedVector(size_t num_elem)
+      : size_(num_elem),
+        storage_(reinterpret_cast<T *>(ruy::detail::SystemAlignedAlloc(sizeof(T) * num_elem))) {}
+
+  T *begin() { return storage_; }
+  T *data() { return storage_; }
+  size_t size() const { return size_; }
+  size_t memSize() const { return sizeof(T) * size_; }
+
+  // Forbid copy
+  AlignedVector(const AlignedVector &) = delete;
+  AlignedVector &operator=(const AlignedVector &) = delete;
+
+  ~AlignedVector() { ruy::detail::SystemAlignedFree(reinterpret_cast<void *>(storage_)); }
+
+private:
+  size_t size_;
+  T *storage_;
+};
+
+
+inline void GemmRuy(const bool transA,
+                    const bool transB,
+                    const int M,
+                    const int N,
+                    const int K,
+                    const float alpha,
+                    const float *A,
+                    const int lda,
+                    const float *B,
+                    const int ldb,
+                    const float beta,
+                    float *C,
+                    const int ldc) {
+  LOG(info, "Ruy multiplication called...");
+  ruy::Context context;
+
+  // If we need to transpose, we can swap dimensions in the layout and claim the matrix
+  // is just column-major. Set the ordering accordingly for the transpose.
+  const auto orderA = (transA ? ruy::Order::kColMajor : ruy::Order::kRowMajor);
+  const auto orderB = (transB ? ruy::Order::kColMajor : ruy::Order::kRowMajor);
+
+  ruy::Matrix<float> lhs;
+  ruy::MakeSimpleLayout(M, K, orderA, lhs.mutable_layout());
+  lhs.set_data(A);
+
+  ruy::Matrix<float> rhs;
+  ruy::MakeSimpleLayout(K, N, orderB, rhs.mutable_layout());
+  rhs.set_data(B);
+
+  ruy::Matrix<float> dst;
+  ruy::MakeSimpleLayout(M, N, ruy::Order::kRowMajor, dst.mutable_layout());
+
+  if(beta == 0) {
+    // For beta = 0, we want to avoid the additional allocation. This covers a
+    // large amount of our inference use-cases. sgemm is called with `beta` for
+    // accumulating gradients in backpropagation, which is 0.0 during
+    // inference.
+
+    dst.set_data(C);
+    ruy::MulParams<float, float> mul_params;
+    ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+    // Write out C as C = alpha * [op(A) * op(B)] + beta * C
+    // Can we expect the compiler to autovectorize this?
+    // TODO: Come back and explicitly use SIMD.
+    const size_t size = M * N;
+    const float *opA_opB = C;  // Alias.
+#pragma clang loop vectorize(enable) interleave(enable)
+    for(size_t i = 0; i < size; i++) {
+      C[i] = alpha * opA_opB[i];
+    }
+  } else {
+    // @jerinphilip has not yet been able to find a ruy primitive that does in
+    // place addition to obtain full gemm.
+    //
+    // Safe bet is to make an additional allocation to store the result of
+    // multiply and use the existing values in C.
+    //
+    // See also: https://github.com/google/ruy/issues/307
+
+    AlignedVector<float> intermediate(M * N);
+    dst.set_data(intermediate.data());
+    ruy::MulParams<float, float> mul_params;
+    ruy::Mul(lhs, rhs, mul_params, &context, &dst);
+
+    // Write out C as C = alpha * [op(A) * op(B)] + beta * C
+    // Can we expect the compiler to autovectorize this?
+    // TODO: Come back and explicitly use SIMD. 
+ const size_t size = M * N; + const float *opA_opB = intermediate.data(); +#pragma clang loop vectorize(enable) interleave(enable) + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i] + beta * C[i]; + } + } +} + +#endif // RUY_SGEMM + + inline void sgemm(bool transA, bool transB, int rows_a, @@ -37,7 +148,21 @@ inline void sgemm(bool transA, ldc); #elif USE_ONNX_SGEMM gemm_f32_imp(transA, transB, rows_a, rows_b, width, alpha, a, b, beta, c); -#else +#elif USE_RUY_SGEMM + GemmRuy(transA, + transB, + rows_a, + rows_b, + width, + alpha, + a, + lda, + b, + ldb, + beta, + c, + ldc); +#else transA; transB; rows_a; rows_b; width; alpha; a; lda; b; ldb; beta; c; ldc; // make compiler happy ABORT("Marian must be compiled with a BLAS library"); #endif From 90858a5ddb0dc689da920d19cccda214e049f38c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 19 Apr 2022 16:22:47 +0000 Subject: [PATCH 19/67] Update CI --- .github/workflows/arm.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index e10d29d6f..9cb865895 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -90,7 +90,6 @@ jobs: -DUSE_INTGEMM=off -DUSE_SIMDE=on -DUSE_RUY=on - -DUSE_ONNX_SGEMM=on # For time being. # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. ) # Additionally list variables finally configured. From 557de0ceeae1125441437a1f71ccf66e089fc5aa Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 25 Apr 2022 14:57:27 +0100 Subject: [PATCH 20/67] Using simd_utils instead of SIMDE * Remove SIMDE dependency from integer_common.cpp:AddBias * Typo: __ARM_NEON__ * Revert "Typo: __ARM_NEON__" This reverts commit f29a0bb24e5a0b21d7bfd2f694bc24e37e87dd72. * Typo __ARM_NEON__: NoAutoFormatBuffer * AVX and optional expansion, include guarded by __AVX__ instead of SIMDE * Create an ARM_NEON structure to advance compilation without SIMDE * Import neon header files * No SIMDE for sse_mathfun * Import submodule as a whole, can't do only neon * Removing old files * Update simdutils submodule usage * Remove simde submodule * More SIMDE removal * Remove the neon_mathfun.h file * USE_SIMD_UTILS instead of USE_SIMDE in CI * xmmintrin.h include if SSE for WebAssembly * Include simd_utils only if flag set * Create a dummy float32x4 because Windows * #else block is SSE __m128d now, old behaviour * Windows does not give us flags, expects sse_mathfun * Point simd_utils to a fork with an experimental patch * Restore TODO, it's valid after the reset --- .github/workflows/arm.yml | 2 +- .gitmodules | 6 ++--- CMakeLists.txt | 18 +++++-------- src/3rd_party/CMakeLists.txt | 4 --- src/3rd_party/simd_utils | 1 + src/3rd_party/simde-no-tests | 1 - src/3rd_party/sse_mathfun.h | 6 ----- src/common/types.h | 42 +++++++++++++++++++++++++++--- src/functional/operators.h | 6 +++++ src/tensors/cpu/integer_common.cpp | 30 +++++++++++++++------ 10 files changed, 78 insertions(+), 38 deletions(-) create mode 160000 src/3rd_party/simd_utils delete mode 160000 src/3rd_party/simde-no-tests diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index 9cb865895..1b8ff0839 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -88,7 +88,7 @@ jobs: -DTHREADS_PREFER_PTHREAD_FLAG=ON -DBUILD_ARCH=armv8-a -DUSE_INTGEMM=off - -DUSE_SIMDE=on + -DUSE_SIMD_UTILS=on -DUSE_RUY=on # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. 
) diff --git a/.gitmodules b/.gitmodules index ae76037f1..536fa3829 100644 --- a/.gitmodules +++ b/.gitmodules @@ -23,9 +23,9 @@ [submodule "src/3rd_party/onnxjs"] path = src/3rd_party/onnxjs url = https://github.com/abhi-agg/onnxjs.git -[submodule "src/3rd_party/simde-no-tests"] - path = src/3rd_party/simde-no-tests - url = https://github.com/simd-everywhere/simde-no-tests/ [submodule "src/3rd_party/ruy"] path = src/3rd_party/ruy url = https://github.com/google/ruy +[submodule "src/3rd_party/simd_utils"] + path = src/3rd_party/simd_utils + url = https://github.com/JishinMaster/simd_utils/ diff --git a/CMakeLists.txt b/CMakeLists.txt index d5b61c9a8..19d74d770 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,7 +41,7 @@ option(GENERATE_MARIAN_INSTALL_TARGETS "Generate Marian install targets (require option(COMPILE_WASM "Compile (wasm compatible) marian for WASM target" OFF) option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compile to wasm. Useful for debugging wasm failures by building same sources natively" OFF) -option(USE_SIMDE "Enable simde to target instruction sets" OFF) +option(USE_SIMD_UTILS "Enable simde to target instruction sets" OFF) option(USE_ONNX_SGEMM "Compile with wasm compatible blas" OFF) option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) @@ -55,8 +55,8 @@ if (USE_WASM_COMPATIBLE_SOURCE) add_compile_definitions(WASM_COMPATIBLE_SOURCE) # Setting USE_SSE2 definition to enable SSE2 specific code in "3rd_party/sse_mathfun.h" for wasm builds add_compile_definitions(USE_SSE2) - set(USE_ONNX_SGEMM ON CACHE BOOL "") - set(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "") + set(USE_ONNX_SGEMM ON CACHE BOOL "Use ONNX SGEMM (for WebAssembly)") + set(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without Exceptions") endif() if (COMPILE_WASM) @@ -203,11 +203,6 @@ if(MSVC) add_definitions(-DUSE_INTGEMM=1) endif(USE_INTGEMM) - if(USE_SIMDE) - add_definitions(-DUSE_SIMDE=1) - add_definitions(-DSIMDE_ENABLE_NATIVE_ALIASES=1) - endif(USE_SIMDE) - else(MSVC) # Check we are using at least g++ 5.0 @@ -278,10 +273,9 @@ else(MSVC) add_definitions(-DUSE_INTGEMM=1) endif(USE_INTGEMM) - if(USE_SIMDE) - add_definitions(-DUSE_SIMDE=1) - add_definitions(-DSIMDE_ENABLE_NATIVE_ALIASES=1) - endif(USE_SIMDE) + if(USE_SIMD_UTILS) + add_definitions(-DARM -DSSE -flax-vector-conversions) #added for ARM + endif(USE_SIMD_UTILS) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) # Clang-10.0.0 complains when CUDA is newer than 10.1 diff --git a/src/3rd_party/CMakeLists.txt b/src/3rd_party/CMakeLists.txt index 2fbe69b14..f2062c381 100644 --- a/src/3rd_party/CMakeLists.txt +++ b/src/3rd_party/CMakeLists.txt @@ -27,10 +27,6 @@ if(USE_RUY) add_subdirectory(ruy EXCLUDE_FROM_ALL) endif(USE_RUY) -if(USE_SIMDE) - include_directories(./simde-no-tests) -endif(USE_SIMDE) - if(USE_ONNX_SGEMM) add_subdirectory(./onnxjs) endif(USE_ONNX_SGEMM) diff --git a/src/3rd_party/simd_utils b/src/3rd_party/simd_utils new file mode 160000 index 000000000..696036258 --- /dev/null +++ b/src/3rd_party/simd_utils @@ -0,0 +1 @@ +Subproject commit 6960362584481c977cdae9f6a8f7061a37c766cb diff --git a/src/3rd_party/simde-no-tests b/src/3rd_party/simde-no-tests deleted file mode 160000 index 9af03cd0f..000000000 --- a/src/3rd_party/simde-no-tests +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9af03cd0f30efae1beb94ef31430dc0370b98b0c diff --git a/src/3rd_party/sse_mathfun.h b/src/3rd_party/sse_mathfun.h 
index 89ca1d3ed..bb0be6c65 100644
--- a/src/3rd_party/sse_mathfun.h
+++ b/src/3rd_party/sse_mathfun.h
@@ -29,13 +29,7 @@
   (this is the zlib license)
 */
 
-#ifndef USE_SIMDE
 #include <xmmintrin.h>
-#else
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wstrict-aliasing"
-#include "x86/sse.h"
-#endif
 
 /* yes I know, the top of this file is quite ugly */
 
diff --git a/src/common/types.h b/src/common/types.h
index fb6aab27a..621551daf 100644
--- a/src/common/types.h
+++ b/src/common/types.h
@@ -18,10 +18,14 @@
 
 #ifndef __CUDACC__ // NVCC is very unreliable when it comes to CPU intrinsics, we hide them completely from NVCC-compiled code
-#ifndef USE_SIMDE
+#ifdef __AVX__
 #include <immintrin.h>
-#else
-#include "x86/avx2.h"
+#elif __SSE__
+#include <xmmintrin.h>
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
 #endif
 
 #endif
@@ -167,6 +171,36 @@ struct intgemm8 {
 
 #ifndef __CUDACC__ // vectorized types not available from .cu files
 
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+struct float32x4 {
+private:
+  using __m128 = float32x4_t;
+  __m128 f_;
+
+public:
+  float32x4() {}
+  float32x4(const __m128& f) : f_(f) {}
+  float32x4(const float& f) : f_(vdupq_n_f32(f)) {} // __m128 _mm_set1_ps(float) copies value into all slots
+
+  operator const __m128&() const { return f_; }
+  operator __m128&() { return f_; }
+
+  float operator[] (size_t i) const {
+    return *(((float*)&f_) + i); // potentially undefined, but efficient. In practice __m128 is an array of floats
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, float32x4 f4) {
+    float* a = (float*)&f4;
+    out << "[" << a[0];
+    for(int i = 1; i < 4; i++)
+      out << " " << a[i];
+    out << "]";
+    return out;
+  }
+};
+
+#else
 // @TODO: check what intrinsics are actually available.
 struct float32x4 {
 private:
@@ -194,6 +228,8 @@ struct float32x4 {
   }
 };
 
+#endif
+
 // @TODO: consider how code can be shared via templating
 #ifdef __AVX__
 struct float32x8 {
diff --git a/src/functional/operators.h b/src/functional/operators.h
index d79ac3c05..00cb9c819 100755
--- a/src/functional/operators.h
+++ b/src/functional/operators.h
@@ -213,7 +213,13 @@ struct Ops {
 
 // __CUDA_ARCH__ is defined when compiling device (GPU) code
 #ifndef __CUDACC__
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include "3rd_party/simd_utils/simd_utils.h"
+#include "3rd_party/simd_utils/neon_mathfun.h"
+#else
 #include "3rd_party/sse_mathfun.h"
+#endif
+
 namespace marian {
 namespace functional {
diff --git a/src/tensors/cpu/integer_common.cpp b/src/tensors/cpu/integer_common.cpp
index a941c0d02..edaeb112c 100644
--- a/src/tensors/cpu/integer_common.cpp
+++ b/src/tensors/cpu/integer_common.cpp
@@ -1,16 +1,12 @@
 #include "integer_common.h"
 
-#ifndef USE_SIMDE
+#ifdef __SSE__
 #include <emmintrin.h>
 #include <immintrin.h>
 #include <tmmintrin.h>
 #include <xmmintrin.h>
-#else // USE_SIMDE
-// https://wiki.debian.org/SIMDEverywhere#Approach
-#include "x86/sse2.h"
-#include "x86/avx2.h"
-#include "x86/ssse3.h"
-#include "x86/sse.h"
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
 #endif
 
 namespace marian {
@@ -27,7 +23,9 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) {
 
   for(int j = 0; j < m; ++j) {
     int i = 0;
+
 #ifdef __AVX512F__
+    // Multiples of 16 add together.
    int n16 = n & ~15;
     for(; i < n16; i += 16) {
       __m512 ai = _mm512_loadu_ps(x + j * n + i);
@@ -35,7 +33,8 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) {
       __m512 yi = _mm512_add_ps(ai, bi);
       _mm512_storeu_ps(y + j * n + i, yi);
     }
-#else
+#elif __SSE__
+    // Multiples of 4 add together. 
int n4 = (n / 4) * 4; for(; i < n4; i += 4) { __m128 ai = _mm_loadu_ps(x + j * n + i); @@ -43,6 +42,21 @@ void AddBias(marian::Tensor C, const marian::Tensor Bias) { __m128 yi = _mm_add_ps(ai, bi); _mm_storeu_ps(y + j * n + i, yi); } +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) + int n4 = (n / 4) * 4; + using __m128 = float32x4_t; + for(; i < n4; i += 4) { + __m128 ai = vld1q_f32(x + j * n + i); + __m128 bi = vld1q_f32(bias + i); + __m128 yi = vaddq_f32(ai, bi); + vst1q_f32(y + j * n + i, yi); + } + +#else + // StandardCPP No SIMD case. + for(i = 0; i < n; i++) { + y[j * n + i] = x[j * n + i] + bias[i]; + } #endif for(; i < n; i++) { y[j * n + i] = x[j * n + i] + bias[i]; From d10009fdb9a2be0449a4c232fadfc16276744bab Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:14:18 +0000 Subject: [PATCH 21/67] Style fixes: UnquantizeAndWrite, UnquantizeAddBiasAndWrite - Underscore suffix for curried args. - Make args private. --- src/tensors/cpu/ruy_adapter.h | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 89e856d10..f8117ab5d 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -74,33 +74,36 @@ struct Preprocess { struct UnquantizeAndAddBiasAndWrite { UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) - : unquant_multiplier(unquant_multiplier), input_bias_prepared(input_bias_prepared) {} + : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { for(Index i = 0; i < rows_A; i++) { for(Index j = 0; j < cols_B; j++) { Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier) + input_bias_prepared[j]; + output[idx] = (input[idx] * unquant_multiplier_) + input_bias_prepared_[j]; } } } - float unquant_multiplier; - const float *input_bias_prepared; + private: + float unquant_multiplier_; + const float *input_bias_prepared_; }; struct UnquantizeAndWrite { - UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier(unquant_multiplier) {} + UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { for(Index i = 0; i < rows_A; i++) { for(Index j = 0; j < cols_B; j++) { Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier); + output[idx] = (input[idx] * unquant_multiplier_); } } } - float unquant_multiplier; + + private: + float unquant_multiplier_; }; }; @@ -251,18 +254,18 @@ struct Preprocess { struct UnquantizeAndAddBiasAndWrite { UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) - : unquant_multiplier(unquant_multiplier), input_bias_prepared(input_bias_prepared) {} + : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); const int32x4_t *Input = reinterpret_cast(input); const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); float32x4_t *Output = reinterpret_cast(output); while(Input != InputEnd) { // Bias cycles every column for addition. 
- const float32x4_t *Bias = reinterpret_cast(input_bias_prepared); + const float32x4_t *Bias = reinterpret_cast(input_bias_prepared_); // InputEnd needs to be determined to end the while loop below. const int32x4_t *RowEnd = reinterpret_cast( @@ -278,16 +281,17 @@ struct Preprocess { } } - float unquant_multiplier; - const float *input_bias_prepared; + private: + float unquant_multiplier_; + const float *input_bias_prepared_; }; struct UnquantizeAndWrite { - UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier(unquant_multiplier) {} + UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier); + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); const int32x4_t *Input = reinterpret_cast(input); const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); float32x4_t *Output = reinterpret_cast(output); @@ -309,7 +313,8 @@ struct Preprocess { } } - float unquant_multiplier; + private: + float unquant_multiplier_; }; }; From 3a37966868263af54a43e2bb3b0cf70d01ae5a66 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:24:14 +0000 Subject: [PATCH 22/67] const for () operator overrides --- src/tensors/cpu/ruy_adapter.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index f8117ab5d..6015612d7 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -76,7 +76,7 @@ struct Preprocess { UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { for(Index i = 0; i < rows_A; i++) { for(Index j = 0; j < cols_B; j++) { Index idx = i * cols_B + j; @@ -93,7 +93,7 @@ struct Preprocess { struct UnquantizeAndWrite { UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { for(Index i = 0; i < rows_A; i++) { for(Index j = 0; j < cols_B; j++) { Index idx = i * cols_B + j; @@ -256,7 +256,7 @@ struct Preprocess { UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { // Set all registers in lane from same scalar value. float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); const int32x4_t *Input = reinterpret_cast(input); @@ -289,7 +289,7 @@ struct Preprocess { struct UnquantizeAndWrite { UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) { + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { // Set all registers in lane from same scalar value. 
float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); const int32x4_t *Input = reinterpret_cast(input); From 418a7ce143621b56636cbd6062d13b509feba280 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:42:51 +0000 Subject: [PATCH 23/67] Explicit for single argument constructor: UnquantizeAndWrite --- src/tensors/cpu/ruy_adapter.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 6015612d7..a9d51e43a 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -91,7 +91,8 @@ struct Preprocess { }; struct UnquantizeAndWrite { - UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} + explicit UnquantizeAndWrite(float unquant_multiplier) + : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { for(Index i = 0; i < rows_A; i++) { @@ -287,7 +288,8 @@ struct Preprocess { }; struct UnquantizeAndWrite { - UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} + explicit tUnquantizeAndWrite(float unquant_multiplier) + : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { // Set all registers in lane from same scalar value. From b7412c3b5fe677d31c61ab1718187178635dc71d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:49:07 +0000 Subject: [PATCH 24/67] Fix typo --- src/tensors/cpu/ruy_adapter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index a9d51e43a..09fd7b427 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -288,7 +288,7 @@ struct Preprocess { }; struct UnquantizeAndWrite { - explicit tUnquantizeAndWrite(float unquant_multiplier) + explicit UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { From 071e0d4be4edaaad12eb82a6bef75bcd81318f99 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Sat, 28 May 2022 17:50:44 +0000 Subject: [PATCH 25/67] Low compute path for special case alpha = 1.0 --- src/tensors/cpu/prod_blas.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index a3d11db9c..5415420f9 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -80,15 +80,18 @@ inline void GemmRuy(const bool transA, ruy::MulParams mul_params; ruy::Mul(lhs, rhs, mul_params, &context, &dst); - // Write out C as C = alpha * [op(A) * op(B)] + beta * C - // Can we expect the compiler to autovectorize this? - // TODO: Come back and explicitly use SIMD. - const size_t size = M * N; - const float *opA_opB = C; // Alias. + if(alpha != 1.0) { + // Write out C as C = alpha * [op(A) * op(B)] + beta * C + // Can we expect the compiler to autovectorize this? + // TODO: Come back and explicitly use SIMD. + const size_t size = M * N; + const float *opA_opB = C; // Alias. #pragma clang loop vectorize(enable) interleave(enable) - for(size_t i = 0; i < size; i++) { - C[i] = alpha * opA_opB[i]; + for(size_t i = 0; i < size; i++) { + C[i] = alpha * opA_opB[i]; + } } + } else { // @jerinphilip has not yet been able to find a ruy primitive that does in // place addition to obtain full gemm. 
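PATCHES 14 and 21 through 24 above converge on a callback design: the epilogue (unquantize, optionally adding bias) is captured in a small functor that Multiply invokes on the raw int32 GEMM result, so a single Multiply overload serves both the bias and no-bias paths. A minimal sketch of that shape, outside the patch and with illustrative names only:

    #include <cstdint>

    using Index = unsigned int;

    // Epilogue functor: unquantizes the int32 GEMM result into the fp32 output.
    struct UnquantizeAndWrite {
      explicit UnquantizeAndWrite(float unquant_multiplier)
          : unquant_multiplier_(unquant_multiplier) {}
      void operator()(const std::int32_t *input, Index rows_A, Index cols_B, float *output) const {
        for(Index i = 0; i < rows_A * cols_B; ++i)
          output[i] = input[i] * unquant_multiplier_;
      }

    private:
      float unquant_multiplier_;
    };

    // A Multiply-shaped function stays agnostic of the epilogue: whatever
    // callable the caller hands in consumes the int32 result.
    template <class Callback>
    void finishMultiply(const std::int32_t *gemm_result,
                        Index rows_A,
                        Index cols_B,
                        float *output,
                        Callback callback) {
      callback(gemm_result, rows_A, cols_B, output);
    }

Because the callback carries its own curried state (multiplier, bias pointer), the if-on-bias branch disappears from the hot path, mirroring intgemm's callback style that the commit message alludes to.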
From ec886bdead5f224d79be62e8e0051a2bd9be1036 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 30 May 2022 10:04:47 +0000 Subject: [PATCH 26/67] Remove clang only pragmas --- src/tensors/cpu/prod_blas.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index 5415420f9..1a1ae4b35 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -86,7 +86,6 @@ inline void GemmRuy(const bool transA, // TODO: Come back and explicitly use SIMD. const size_t size = M * N; const float *opA_opB = C; // Alias. -#pragma clang loop vectorize(enable) interleave(enable) for(size_t i = 0; i < size; i++) { C[i] = alpha * opA_opB[i]; } @@ -111,7 +110,6 @@ inline void GemmRuy(const bool transA, // TODO: Come back and explicitly use SIMD. const size_t size = M * N; const float *opA_opB = intermediate.data(); -#pragma clang loop vectorize(enable) interleave(enable) for(size_t i = 0; i < size; i++) { C[i] = alpha * opA_opB[i] + beta * C[i]; } From 4df1998ef60d54ce53e38865de4ca20f9c63d51d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 30 May 2022 10:24:55 +0000 Subject: [PATCH 27/67] Remove leftover bias cycles comment --- src/tensors/cpu/ruy_adapter.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 09fd7b427..6be3bbd59 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -299,8 +299,6 @@ struct Preprocess { float32x4_t *Output = reinterpret_cast(output); while(Input != InputEnd) { - // Bias cycles every column for addition. - // InputEnd needs to be determined to end the while loop below. const int32x4_t *RowEnd = reinterpret_cast( reinterpret_cast(Input) + cols_B); From c4be980fca1f1031198b8c6a1aa241b86bd7bbaf Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 2 Jun 2022 14:14:58 +0000 Subject: [PATCH 28/67] Defaults: intgemm for x86_64 and ruy and simd_utils for arm --- CMakeLists.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cb5ec3bdf..f19481700 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,8 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) -option(USE_INTGEMM "Use INTGEMM" ON) +option(USE_INTGEMM "Use INTGEMM" OFF) +option(USE_RUY "Use Ruy" OFF) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) @@ -50,6 +51,15 @@ option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) + +if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64) + set(USE_INTGEMM ON) +else() + set(USE_RUY ON) + set(USE_RUY_SGEMM ON) + set(USE_SIMD_UTILS ON) +endif() + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) From b181847ae119658921798a3ca3f9cb0a803d63b1 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 6 Jun 2022 12:55:59 +0000 Subject: [PATCH 29/67] Revert "Defaults: intgemm for x86_64 and ruy and simd_utils for arm" This reverts commit c4be980fca1f1031198b8c6a1aa241b86bd7bbaf. 
--- CMakeLists.txt | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f19481700..cb5ec3bdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,8 +31,7 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) -option(USE_INTGEMM "Use INTGEMM" OFF) -option(USE_RUY "Use Ruy" OFF) +option(USE_INTGEMM "Use INTGEMM" ON) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) @@ -51,15 +50,6 @@ option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) - -if(CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64 OR CMAKE_SYSTEM_PROCESSOR STREQUAL amd64) - set(USE_INTGEMM ON) -else() - set(USE_RUY ON) - set(USE_RUY_SGEMM ON) - set(USE_SIMD_UTILS ON) -endif() - if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) From 6e4c5610e8ffe5d5cddf28f8dfd71981d22d2387 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 7 Jun 2022 13:04:44 +0100 Subject: [PATCH 30/67] Target architecture detection for ARM --- .github/workflows/arm.yml | 3 - CMakeLists.txt | 24 ++++++- cmake/TargetArch.cmake | 142 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 4 deletions(-) create mode 100644 cmake/TargetArch.cmake diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index 1b8ff0839..9667ece05 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -87,9 +87,6 @@ jobs: -DCMAKE_USE_PTHREADS_INIT=1 -DTHREADS_PREFER_PTHREAD_FLAG=ON -DBUILD_ARCH=armv8-a - -DUSE_INTGEMM=off - -DUSE_SIMD_UTILS=on - -DUSE_RUY=on # -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. ) # Additionally list variables finally configured. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index cb5ec3bdf..d20cc9e39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,19 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) include(CMakeDependentOption) +# Architecture detection +include(TargetArch) + +target_architecture(CMAKE_TARGET_ARCHITECTURES) +list(LENGTH CMAKE_TARGET_ARCHITECTURES cmake_target_arch_len) +if(NOT "${cmake_target_arch_len}" STREQUAL "1") + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL TRUE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "universal") +else() + set(CMAKE_TARGET_ARCHITECTURE_UNIVERSAL FALSE) + set(CMAKE_TARGET_ARCHITECTURE_CODE "${CMAKE_TARGET_ARCHITECTURES}") +endif() + # Custom CMake options option(COMPILE_CPU "Compile CPU version" ON) option(COMPILE_CUDA "Compile GPU version" ON) @@ -31,7 +44,8 @@ option(USE_CCACHE "Use ccache compiler cache (https://ccache.dev)" OFF) option(USE_CUDNN "Use CUDNN library" OFF) option(USE_DOXYGEN "Build documentation with Doxygen" ON) option(USE_FBGEMM "Use FBGEMM" OFF) -option(USE_INTGEMM "Use INTGEMM" ON) +option(USE_INTGEMM "Use INTGEMM" OFF) +option(USE_RUY "Use Ruy" OFF) option(USE_MKL "Compile with MKL support" ON) option(USE_MPI "Use MPI library" OFF) option(USE_NCCL "Use NCCL library" ON) @@ -50,6 +64,14 @@ option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) +if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + set(USE_RUY ON) + set(USE_RUY_SGEMM ON) + set(USE_SIMD_UTILS ON) +else() + set(USE_INTGEMM ON) +endif() + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake new file mode 100644 index 000000000..01f4a44d7 --- /dev/null +++ b/cmake/TargetArch.cmake @@ -0,0 +1,142 @@ +# Modified from https://github.com/axr/solar-cmake/blob/73cfea0db0284c5e2010aca23989046e5bda95c9/Solar.cmake +# Based on the Qt 5 processor detection code, so should be very accurate +# https://qt.gitorious.org/qt/qtbase/blobs/master/src/corelib/global/qprocessordetection.h +# Currently handles arm (v5, v6, v7), x86 (32/64), ia64, and ppc (32/64) + +# Regarding POWER/PowerPC, just as is noted in the Qt source, +# "There are many more known variants/revisions that we do not handle/detect." 
+ +set(archdetect_c_code " +#if defined(__arm__) || defined(__TARGET_ARCH_ARM) || defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || defined(__ARM64__) + #if defined(__ARM_ARCH_8__) || defined(__ARM_ARCH_8) \\ + || defined(__ARM_ARCH_8A__) || defined(__ARM_ARCH_8A) \\ + || defined(__ARM_ARCH_8R__) || defined(__ARM_ARCH_8R) \\ + || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8) + #error cmake_ARCH armv8 + #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7) \\ + || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7A__) \\ + || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7R__) \\ + || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7) + #error cmake_ARCH armv7 + #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6) \\ + || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6J__) \\ + || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6T2__) \\ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6Z__) \\ + || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6K__) \\ + || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6ZK__) \\ + || defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6M__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6) + #error cmake_ARCH armv6 + #elif defined(__ARM_ARCH_5TEJ__) \\ + || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 5) + #error cmake_ARCH armv5 + #else + #error cmake_ARCH arm + #endif +#elif defined(__i386) || defined(__i386__) || defined(_M_IX86) + #error cmake_ARCH i386 +#elif defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64) + #error cmake_ARCH x86_64 +#elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) + #error cmake_ARCH ia64 +#elif defined(__ppc__) || defined(__ppc) || defined(__powerpc__) \\ + || defined(_ARCH_COM) || defined(_ARCH_PWR) || defined(_ARCH_PPC) \\ + || defined(_M_MPPC) || defined(_M_PPC) + #if defined(__ppc64__) || defined(__powerpc64__) || defined(__64BIT__) + #error cmake_ARCH ppc64 + #else + #error cmake_ARCH ppc + #endif +#endif + +#error cmake_ARCH unknown +") + + +# Set ppc_support to TRUE before including this file or ppc and ppc64 +# will be treated as invalid architectures since they are no longer supported by Apple + +function(target_architecture output_var) + if(APPLE AND CMAKE_OSX_ARCHITECTURES) + # On OS X we use CMAKE_OSX_ARCHITECTURES *if* it was set + # First let's normalize the order of the values + + # Note that it's not possible to compile PowerPC applications if you are using + # the OS X SDK version 10.6 or later - you'll need 10.4/10.5 for that, so we + # disable it by default + # See this page for more information: + # http://stackoverflow.com/questions/5333490/how-can-we-restore-ppc-ppc64-as-well-as-full-10-4-10-5-sdk-support-to-xcode-4 + + # Architecture defaults to i386 or ppc on OS X 10.5 and earlier, depending on the CPU type detected at runtime. + # On OS X 10.6+ the default is x86_64 if the CPU supports it, i386 otherwise. 
+ + foreach(osx_arch ${CMAKE_OSX_ARCHITECTURES}) + if("${osx_arch}" STREQUAL "ppc" AND ppc_support) + set(osx_arch_ppc TRUE) + elseif("${osx_arch}" STREQUAL "i386") + set(osx_arch_i386 TRUE) + elseif("${osx_arch}" STREQUAL "x86_64") + set(osx_arch_x86_64 TRUE) + elseif("${osx_arch}" STREQUAL "ppc64" AND ppc_support) + set(osx_arch_ppc64 TRUE) + else() + message(FATAL_ERROR "Invalid OS X arch name: ${osx_arch}") + endif() + endforeach() + + # Now add all the architectures in our normalized order + if(osx_arch_ppc) + list(APPEND ARCH ppc) + endif() + + if(osx_arch_i386) + list(APPEND ARCH i386) + endif() + + if(osx_arch_x86_64) + list(APPEND ARCH x86_64) + endif() + + if(osx_arch_ppc64) + list(APPEND ARCH ppc64) + endif() + else() + file(WRITE "${CMAKE_BINARY_DIR}/arch.c" "${archdetect_c_code}") + + enable_language(C) + + # Detect the architecture in a rather creative way... + # This compiles a small C program which is a series of ifdefs that selects a + # particular #error preprocessor directive whose message string contains the + # target architecture. The program will always fail to compile (both because + # file is not a valid C program, and obviously because of the presence of the + # #error preprocessor directives... but by exploiting the preprocessor in this + # way, we can detect the correct target architecture even when cross-compiling, + # since the program itself never needs to be run (only the compiler/preprocessor) + try_run( + run_result_unused + compile_result_unused + "${CMAKE_BINARY_DIR}" + "${CMAKE_BINARY_DIR}/arch.c" + COMPILE_OUTPUT_VARIABLE ARCH + CMAKE_FLAGS CMAKE_OSX_ARCHITECTURES=${CMAKE_OSX_ARCHITECTURES} + ) + + # Parse the architecture name from the compiler output + string(REGEX MATCH "cmake_ARCH ([a-zA-Z0-9_]+)" ARCH "${ARCH}") + + # Get rid of the value marker leaving just the architecture name + string(REPLACE "cmake_ARCH " "" ARCH "${ARCH}") + + # If we are compiling with an unknown architecture this variable should + # already be set to "unknown" but in the case that it's empty (i.e. 
due + # to a typo in the code), then set it to unknown + if (NOT ARCH) + set(ARCH unknown) + endif() + endif() + + set(${output_var} "${ARCH}" PARENT_SCOPE) +endfunction() From 53636cf42a0e80a1d2aa0c206d853e7451d2663d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 7 Jun 2022 13:57:22 +0000 Subject: [PATCH 31/67] Remove DEBUG statements --- src/tensors/cpu/ruy_adapter.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 6be3bbd59..f080d929c 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -431,14 +431,10 @@ struct IntgemmViaRuy { ruy::MakeSimpleLayout(rows_A, width, ruy::Order::kRowMajor, lhs.mutable_layout()); lhs.set_data(input_A_prepared); - // PRINT_MATRIX_DEBUG(input_A_prepared, rows_A, width, Order::RowMajor); - ruy::Matrix rhs; ruy::MakeSimpleLayout(width, cols_B, ruy::Order::kColMajor, rhs.mutable_layout()); rhs.set_data(input_B_prepared); - // PRINT_MATRIX_DEBUG(input_B_prepared, width, cols_B, Order::ColMajor); - ruy::Matrix dst; ruy::MakeSimpleLayout(rows_A, cols_B, ruy::Order::kRowMajor, dst.mutable_layout()); From be9e153c5664c1e813c0689831975edb607cee54 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 8 Jun 2022 15:11:46 +0000 Subject: [PATCH 32/67] Remove IntBase inheritance; PrepareB still unimplemented --- src/tensors/cpu/ruy_adapter.h | 78 +++++++++++++++++------------------ 1 file changed, 39 insertions(+), 39 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index f080d929c..3bf51bfff 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -328,46 +328,9 @@ struct IntgemmViaRuy { // Convert compile time errors into run-time ABORTS. This allows bringing in only int8_t and // select functions that are required to create a path which will run while not achieving // parity with intgemm. - template - struct IntBase { - using Type = T; - static void Quantize(const float *, Type *, float, Index) { ABORT("Quantize unsupported"); } - - static void PrepareA(const float *input, - Type *output, - float quant_mult, - Index rows, - Index cols) { - ABORT("PrepareA Unsupported"); - } - - static void PrepareB(const float *, Type *, float, Index, Index) { - ABORT("PrepareB Unsupported"); - } - static void PrepareBQuantizedTransposed(const Type *, Type *, Index, Index) { - ABORT("PrepareBQuantizedTransposed Unsupported"); - } - static void PrepareBTransposed(const float *, Type *, float, Index, Index) { - ABORT("PrepareBTransposed Unsupported"); - } - static void SelectColumnsB(const Type *, Type *, Index, const Index *, const Index *) { - ABORT("SelectColumnsB Unsupported"); - } - - template - static void Multiply(const Type *A_prepared, - const Type *B_prepared, - float *output, - Index rows_A, - Index width, - Index cols_B, - Callback callback) { - ABORT("Multiply (A*B) Unsupported"); - } - }; // Intgemm nomenclature expects Int8. Missing functions are ABORTs. - struct Int8 : IntBase { + struct Int8 { using Type = int8_t; static void PrepareBQuantizedTransposed(const Type *input, Type *output, @@ -384,6 +347,10 @@ struct IntgemmViaRuy { Preprocess::quantize(input, output, quant_mult, rows, cols); } + static void PrepareB(const float *, Type *, float, Index, Index) { + ABORT("PrepareB Unsupported"); + } + static void PrepareA(const float *input, int8_t *output, float quant_mult, @@ -450,8 +417,41 @@ struct IntgemmViaRuy { }; // Int16 support is currently missing. 
- struct Int16 : IntBase { + struct Int16 { using Type = int16_t; + static void Quantize(const float *, Type *, float, Index) { ABORT("Quantize unsupported"); } + + static void PrepareA(const float *input, + Type *output, + float quant_mult, + Index rows, + Index cols) { + ABORT("PrepareA Unsupported"); + } + + static void PrepareB(const float *, Type *, float, Index, Index) { + ABORT("PrepareB Unsupported"); + } + static void PrepareBQuantizedTransposed(const Type *, Type *, Index, Index) { + ABORT("PrepareBQuantizedTransposed Unsupported"); + } + static void PrepareBTransposed(const float *, Type *, float, Index, Index) { + ABORT("PrepareBTransposed Unsupported"); + } + static void SelectColumnsB(const Type *, Type *, Index, const Index *, const Index *) { + ABORT("SelectColumnsB Unsupported"); + } + + template + static void Multiply(const Type *A_prepared, + const Type *B_prepared, + float *output, + Index rows_A, + Index width, + Index cols_B, + Callback callback) { + ABORT("Multiply (A*B) Unsupported"); + } }; template From 876a91566093bc1d6795dbbbb90bf59694c73b41 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 8 Jun 2022 15:23:23 +0000 Subject: [PATCH 33/67] Remove obsolete comment --- src/tensors/cpu/ruy_adapter.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 3bf51bfff..2d19fc4a3 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -325,10 +325,6 @@ struct Preprocess { * intgemm_interface.h diff minimal. There are possibly better abstractions. */ struct IntgemmViaRuy { - // Convert compile time errors into run-time ABORTS. This allows bringing in only int8_t and - // select functions that are required to create a path which will run while not achieving - // parity with intgemm. - // Intgemm nomenclature expects Int8. Missing functions are ABORTs. 
struct Int8 { using Type = int8_t; From d399a35d9e2e82ff7ecf29a8f41c1ae9f2242cfa Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Wed, 8 Jun 2022 15:49:30 +0000 Subject: [PATCH 34/67] Remove logging statement in hotpath --- src/tensors/cpu/prod_blas.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index 1a1ae4b35..bab16cfbd 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -51,7 +51,6 @@ inline void GemmRuy(const bool transA, const float beta, float *C, const int ldc) { - LOG(info, "Ruy multiplication called..."); ruy::Context context; // If we need to transpose, we can swap dimensions in layout claim the matrix From 06b6dd96279c64f27b8650329a96f475c38e3ed8 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 11:42:58 +0000 Subject: [PATCH 35/67] =?UTF-8?q?Use=20CMAKE=5FCXX=5FFLAGS=20instead=20of?= =?UTF-8?q?=20add=5Fdefinitions=20=F0=9F=A4=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d20cc9e39..b167d0586 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,18 +286,6 @@ else(MSVC) endif () endif() - if(USE_FBGEMM) - set(EXT_LIBS ${EXT_LIBS} fbgemm dl) - add_definitions(-DUSE_FBGEMM=1) - endif(USE_FBGEMM) - - if(USE_INTGEMM) - add_definitions(-DUSE_INTGEMM=1) - endif(USE_INTGEMM) - - if(USE_SIMD_UTILS) - add_definitions(-DARM -DSSE -flax-vector-conversions) #added for ARM - endif(USE_SIMD_UTILS) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) # Clang-10.0.0 complains when CUDA is newer than 10.1 @@ -367,6 +355,20 @@ else(MSVC) endif(COMPILE_WASM) endif(MSVC) +if(USE_FBGEMM) + set(EXT_LIBS ${EXT_LIBS} fbgemm dl) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_FBGEMM=1") +endif(USE_FBGEMM) + +if(USE_INTGEMM) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTGEMM=1") +endif(USE_INTGEMM) + +if(USE_SIMD_UTILS) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DARM -DSSE -flax-vector-conversions") #added for ARM + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DARM -DSSE -flax-vector-conversions") #added for ARM +endif(USE_SIMD_UTILS) + # with gcc 7.0 and above we need to mark fallthrough in switch case statements # that can be done in comments for backcompat, but CCACHE removes comments. # -C makes gcc keep comments. From e310f73ddfc1c4114e52cd27dd228c848c9c358f Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 12:13:53 +0000 Subject: [PATCH 36/67] Check: Does add_compile_{defs,opts} propogate up? 
--- CMakeLists.txt | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b167d0586..7cc9c3061 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,24 @@ else() set(USE_INTGEMM ON) endif() +if(USE_FBGEMM) + set(EXT_LIBS ${EXT_LIBS} fbgemm dl) + add_compile_definitions(USE_FBGEMM=1) +endif(USE_FBGEMM) + +if(USE_INTGEMM) + add_compile_defintions(USE_INTGEMM=1) +endif(USE_INTGEMM) + +if(USE_SIMD_UTILS) + add_compile_definitions(ARM SSE) #added for ARM + if(MSVC) + add_compile_options(/flax-vector-conversions) + else(MSVC) + add_compile_options(-flax-vector-conversions) + endif(MSVC) +endif(USE_SIMD_UTILS) + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) @@ -355,19 +373,6 @@ else(MSVC) endif(COMPILE_WASM) endif(MSVC) -if(USE_FBGEMM) - set(EXT_LIBS ${EXT_LIBS} fbgemm dl) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_FBGEMM=1") -endif(USE_FBGEMM) - -if(USE_INTGEMM) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_INTGEMM=1") -endif(USE_INTGEMM) - -if(USE_SIMD_UTILS) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DARM -DSSE -flax-vector-conversions") #added for ARM - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DARM -DSSE -flax-vector-conversions") #added for ARM -endif(USE_SIMD_UTILS) # with gcc 7.0 and above we need to mark fallthrough in switch case statements # that can be done in comments for backcompat, but CCACHE removes comments. From 3bf113317e9a92ff86618f941ba5c0dc8b41cb29 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 12:16:19 +0000 Subject: [PATCH 37/67] Fix typo: definitions --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cc9c3061..c2ed92cb1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,7 @@ if(USE_FBGEMM) endif(USE_FBGEMM) if(USE_INTGEMM) - add_compile_defintions(USE_INTGEMM=1) + add_compile_definitions(USE_INTGEMM=1) endif(USE_INTGEMM) if(USE_SIMD_UTILS) From 5c8b1d20b730ef9a9adbb709c850f55f6b5f9940 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 12:40:06 +0000 Subject: [PATCH 38/67] Undo edit attempts manually for min-diff; Using compile_definitions now --- CMakeLists.txt | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c2ed92cb1..5b5a574ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,11 +72,6 @@ else() set(USE_INTGEMM ON) endif() -if(USE_FBGEMM) - set(EXT_LIBS ${EXT_LIBS} fbgemm dl) - add_compile_definitions(USE_FBGEMM=1) -endif(USE_FBGEMM) - if(USE_INTGEMM) add_compile_definitions(USE_INTGEMM=1) endif(USE_INTGEMM) @@ -211,7 +206,6 @@ if(MSVC) # set(INTRINSICS "/arch:AVX") add_definitions(-DUSE_SSE2=1) - # Or maybe use these? 
set(INTRINSICS ${MSVC_BUILD_ARCH}) # set(INTRINSICS "/arch:AVX512") @@ -238,13 +232,7 @@ if(MSVC) set(EXT_LIBS ${EXT_LIBS} fbgemm) add_definitions(-DUSE_FBGEMM=1 -DFBGEMM_STATIC=1) endif(USE_FBGEMM) - - if(USE_INTGEMM) - add_definitions(-DUSE_INTGEMM=1) - endif(USE_INTGEMM) - else(MSVC) - # Check we are using at least g++ 5.0 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) message(FATAL_ERROR "FATAL ERROR: Compiling Marian requires at least g++ 5.0, your version is ${CMAKE_CXX_COMPILER_VERSION}") @@ -304,6 +292,10 @@ else(MSVC) endif () endif() + if(USE_FBGEMM) + set(EXT_LIBS ${EXT_LIBS} fbgemm dl) + add_compile_definitions(USE_FBGEMM=1) + endif(USE_FBGEMM) if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 9.0) # Clang-10.0.0 complains when CUDA is newer than 10.1 @@ -600,7 +592,7 @@ if(COMPILE_CPU) if(USE_RUY) set(EXT_LIBS ${EXT_LIBS} ruy) - add_definitions(-DUSE_RUY_SGEMM=1) + add_compile_definitions(USE_RUY_SGEMM=1) endif(USE_RUY) add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU From 9dd1eff086674952b89cf252107dce29cc282605 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 14:03:16 +0000 Subject: [PATCH 39/67] Restore CMakeDependentOption; Rename only to ONNX_SGEMM --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b5a574ea..7f35af61e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,10 @@ option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) # cmake options that are dependent on USE_WASM_COMPATIBLE_SOURCE cmake option CMAKE_DEPENDENT_OPTION(USE_THREADS "Compile with multi-threading support" OFF "USE_WASM_COMPATIBLE_SOURCE" ON) +CMAKE_DEPENDENT_OPTION(USE_ONNX_SGEMM "Compile with wasm compatible blas" ON + "USE_WASM_COMPATIBLE_SOURCE" OFF) +CMAKE_DEPENDENT_OPTION(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" ON + "USE_WASM_COMPATIBLE_SOURCE" OFF) if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") set(USE_RUY ON) @@ -85,13 +89,12 @@ if(USE_SIMD_UTILS) endif(MSVC) endif(USE_SIMD_UTILS) + if (USE_WASM_COMPATIBLE_SOURCE) set(SPM_BUILD_LIBRARY_ONLY ON CACHE BOOL "Build only sentencepiece library (skip building executables)") add_compile_definitions(WASM_COMPATIBLE_SOURCE) # Setting USE_SSE2 definition to enable SSE2 specific code in "3rd_party/sse_mathfun.h" for wasm builds add_compile_definitions(USE_SSE2) - set(USE_ONNX_SGEMM ON CACHE BOOL "Use ONNX SGEMM (for WebAssembly)") - set(COMPILE_WITHOUT_EXCEPTIONS ON CACHE BOOL "Compile without Exceptions") endif() if (COMPILE_WASM) From b055c11bfa71332d0a87ae5ae2b82430af4555c4 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 14:28:10 +0000 Subject: [PATCH 40/67] Backtrack attempt to flatten ONNX_SGEMM out --- CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f35af61e..886fb63a6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -606,9 +606,7 @@ if(COMPILE_CPU) ## ^ SGEMM != BLAS set(EXT_LIBS ${EXT_LIBS} onnx-sgemm) add_definitions(-DUSE_ONNX_SGEMM=1) # Might be required in some cmake files further down the line, let's avoid using add_compile_definitions in this codeblock - endif(USE_ONNX_SGEMM) - - if(APPLE AND USE_APPLE_ACCELERATE) + elseif(APPLE AND USE_APPLE_ACCELERATE) set(BLAS_VENDOR "Accelerate") # see https://developer.apple.com/documentation/accelerate for 
more info # you may need to install Xcode command line tools if you don't have them already (https://developer.apple.com/xcode/features/) From d006196de2b22744c2a01a998b2576db496af28a Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 9 Jun 2022 15:28:24 +0000 Subject: [PATCH 41/67] USE_ONNX_SGEMM is a CMakeDependentOption --- CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 886fb63a6..433258442 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,7 +56,6 @@ option(COMPILE_WASM "Compile (wasm compatible) marian for WASM target" OFF) option(USE_WASM_COMPATIBLE_SOURCE "Enable the minimal marian sources that compile to wasm. Useful for debugging wasm failures by building same sources natively" OFF) option(USE_SIMD_UTILS "Enable simde to target instruction sets" OFF) -option(USE_ONNX_SGEMM "Compile with wasm compatible blas" OFF) option(USE_RUY_SGEMM "Compile with Ruy SGEMM" OFF) option(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" OFF) From 39b72375877c0642994254dcc38fb2f94ba9f367 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Fri, 10 Jun 2022 16:50:56 +0000 Subject: [PATCH 42/67] Keep pre armv8 TargetArch detect unchanged --- cmake/TargetArch.cmake | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cmake/TargetArch.cmake b/cmake/TargetArch.cmake index 01f4a44d7..f653e3e28 100644 --- a/cmake/TargetArch.cmake +++ b/cmake/TargetArch.cmake @@ -14,19 +14,19 @@ set(archdetect_c_code " || defined(__ARM_ARCH_8M__) || defined(__ARM_ARCH_8M) \\ || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 8) #error cmake_ARCH armv8 - #elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7) \\ - || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7A__) \\ - || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7R__) \\ - || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7M__) \\ + #elif defined(__ARM_ARCH_7__) \\ + || defined(__ARM_ARCH_7A__) \\ + || defined(__ARM_ARCH_7R__) \\ + || defined(__ARM_ARCH_7M__) \\ || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 7) #error cmake_ARCH armv7 - #elif defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6) \\ - || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6J__) \\ - || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6T2__) \\ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6Z__) \\ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6K__) \\ - || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6ZK__) \\ - || defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6M__) \\ + #elif defined(__ARM_ARCH_6__) \\ + || defined(__ARM_ARCH_6J__) \\ + || defined(__ARM_ARCH_6T2__) \\ + || defined(__ARM_ARCH_6Z__) \\ + || defined(__ARM_ARCH_6K__) \\ + || defined(__ARM_ARCH_6ZK__) \\ + || defined(__ARM_ARCH_6M__) \\ || (defined(__TARGET_ARCH_ARM) && __TARGET_ARCH_ARM-0 >= 6) #error cmake_ARCH armv6 #elif defined(__ARM_ARCH_5TEJ__) \\ From 3a6c51516670eaca3e841ccb76840208091f3278 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 08:57:47 +0000 Subject: [PATCH 43/67] Simple ARM detection to no-op out shifted/shiftedAll paths --- src/tensors/cpu/backend.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 583a2f792..7804c5577 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -78,14 +78,25 @@ class Backend : public marian::Backend { void setInt8(bool optimize) override { int8_ = optimize; } bool isInt8() override { return int8_; } - void 
setShifted(bool shifted) override { shifted_ = shifted; } + void setShifted(bool shifted) override { +#if (defined(__arm__) || defined(__aarch64__)) + shifted_ = false; +#else + shifted_ = shifted; +#endif + } bool isShifted() override { return shifted_; } void setShiftedAll(bool shiftedAll) override { +#if (defined(__arm__) || defined(__aarch64__)) + shiftedAll_ = false; + shifted_ = false; +#else shiftedAll_ = shiftedAll; if (shiftedAll_) { shifted_ = true; } +#endif } bool isShiftedAll() override { From 46db01bf313a12e64b8cb370f4d4af2ade0269ff Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 09:20:50 +0000 Subject: [PATCH 44/67] Add logging statements to indicate forced gemm-path change at construction --- src/tensors/cpu/backend.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h index 7804c5577..a571cbc80 100644 --- a/src/tensors/cpu/backend.h +++ b/src/tensors/cpu/backend.h @@ -80,6 +80,7 @@ class Backend : public marian::Backend { void setShifted(bool shifted) override { #if (defined(__arm__) || defined(__aarch64__)) + LOG(info, "gemm-precision: *shifted* is not available on ARM; Setting to false."); shifted_ = false; #else shifted_ = shifted; @@ -89,6 +90,7 @@ class Backend : public marian::Backend { void setShiftedAll(bool shiftedAll) override { #if (defined(__arm__) || defined(__aarch64__)) + LOG(info, "gemm-precision: *shifted* is not available on ARM; Setting to false."); shiftedAll_ = false; shifted_ = false; #else From 63fea9a7d438917858995b17ddebaea64120d71a Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 09:20:59 +0000 Subject: [PATCH 45/67] Remove run script --- scripts/run.sh | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 scripts/run.sh diff --git a/scripts/run.sh b/scripts/run.sh deleted file mode 100644 index f080dee68..000000000 --- a/scripts/run.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash - -NDK=/mnt/Storage/jphilip/android-ndk-r23b -ABI=arm64-v8a -MINSDK_VERSION=28 -CUSTOM_MODULE_PATH=/mnt/Storage/jphilip/marian-android/openblas-install/lib/cmake/openblas -ANDROID_PLATFORM=28 - -OTHER_ANDROID_ARGS=( - -DANDROID_ARM_NEON=TRUE -) - -OTHER_MARIAN_ARGS=( - -DCOMPILE_CUDA=off - -DCOMPILE_CPU=on - -DCMAKE_HAVE_THREADS_LIBRARY=1 - -DCMAKE_USE_WIN32_THREADS_INIT=0 - -DCMAKE_USE_PTHREADS_INIT=1 - -DTHREADS_PREFER_PTHREAD_FLAG=ON - -DBUILD_ARCH=armv8-a - -DUSE_INTGEMM=off - -DUSE_SIMDE=on - -DUSE_RUY=on - -DUSE_ONNX_SGEMM=on # For time being. - -DCOMPILE_WITHOUT_EXCEPTIONS=on # Apparently this can reduce the binary size, let's see. -) -# Additionally list variables finally configured. -cmake -L \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake \ - -DCMAKE_MODULE_PATH=$CUSTOM_MODULE_PATH \ - -DANDROID_TOOLCHAIN=clang \ - -DANDROID_ABI=$ABI \ - -DANDROID_PLATFORM=$ANDROID_PLATFORM \ - -DANDROID_NATIVE_API_LEVEL=$MINSDKVERSION \ - -DANDROID_TOOLCHAIN_NAME=arm-linux-androideabi-4.8 \ - -DANDROID_STL=c++_static \ - -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DCMAKE_C_COMPILER_LAUNCHER=ccache \ - "${OTHER_ANDROID_ARGS[@]}" "${OTHER_MARIAN_ARGS[@]}" \ - .. 
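Patch 46 below deletes the Path::kStandardCpp specialization, leaving only the NEON code in ruy_adapter.h. Since the commit message defers the scalar path to "later for tests separately", a self-contained restatement of the deleted scalar quantizer is kept here for reference; it mirrors the removed code rather than adding anything new.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    using Index = unsigned int;

    // Scalar reference: round to nearest after scaling, saturate to the
    // int8 range [-127, 127], then narrow. Same logic as the deleted
    // Preprocess<Path::kStandardCpp>::quantize.
    inline void quantizeRef(const float *input,
                            std::int8_t *output,
                            float scale,
                            Index rows,
                            Index width) {
      const Index size = rows * width;
      for(Index i = 0; i < size; i++) {
        float value = std::roundf(scale * input[i]);
        value = std::max(-127.0f, value);
        value = std::min(127.0f, value);
        output[i] = static_cast<std::int8_t>(value);
      }
    }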
From 82a15e1900e0a5caf08dee37b6d69946ad847218 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 09:52:51 +0000 Subject: [PATCH 46/67] Removing kStandardCpp - may add later for tests separately --- src/tensors/cpu/intgemm_interface.h | 4 +- src/tensors/cpu/ruy_adapter.h | 342 +++++++++++----------------- 2 files changed, 133 insertions(+), 213 deletions(-) diff --git a/src/tensors/cpu/intgemm_interface.h b/src/tensors/cpu/intgemm_interface.h index 0612559d9..865c97d3f 100644 --- a/src/tensors/cpu/intgemm_interface.h +++ b/src/tensors/cpu/intgemm_interface.h @@ -461,7 +461,7 @@ float scalar_; intgemm::callbacks::UnquantizeAndWrite(unquant_mult, val_->data())); #else typedef typename intgemm_::type Integer; - auto callback = marian::cpu::integer::Preprocess::UnquantizeAndWrite(unquant_mult); + auto callback = marian::cpu::integer::UnquantizeAndWrite(unquant_mult); intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ val_->data(), /*output*/ @@ -555,7 +555,7 @@ class AffineNodeOp : public NaryNodeOp { } #else typedef typename intgemm_::type Integer; - auto callback = marian::cpu::integer::Preprocess::UnquantizeAndAddBiasAndWrite(unquant_mult, + auto callback = marian::cpu::integer::UnquantizeAndAddBiasAndWrite(unquant_mult, child(2)->val()->data() /*child(2) is bias*/); intgemm_::width::Multiply(reinterpret_cast(child(0)->val()->data()), /*A*/ reinterpret_cast(child(1)->val()->data()), /*B*/ diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 2d19fc4a3..90383e86a 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -28,85 +28,9 @@ using Index = unsigned int; // and required in the fast matrix-multiplication workflow for machine-translation, that exists in // marian. -enum class Path { - kStandardCpp = 0, // Pure C++ - kNeon = 1 // NEON Intrinsics (ARM) -}; - -#if RUY_PLATFORM_NEON -constexpr Path kHighestPath = Path::kNeon; -#else -constexpr Path kHighestPath = Path::kStandardCpp; -#endif - -template -struct Preprocess; - /* * Naive implementation using standard C++ functions. Not optimized using SIMD operations. */ -template <> -struct Preprocess { - static void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { - const Index size = rows * width; - for(Index i = 0; i < size; i++) { - // Round to nearest after multiplying with scale. - float value = roundf(scale * input[i]); - - // Since float can store bigger values, we threshold anything that's gone - // higher and can't fit in int8. - value = std::max(-127.0f, value); - value = std::min(127.0f, value); - - // Finally a static cast. 
- output[i] = static_cast(value); - }; - } - - template - static void transpose(const Scalar *input, Index rows, Index cols, Scalar *output) { - for(Index i = 0; i < rows; i++) { - for(Index j = 0; j < cols; j++) { - output[j * rows + i] = input[i * cols + j]; - } - } - } - - struct UnquantizeAndAddBiasAndWrite { - UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) - : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} - - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { - for(Index i = 0; i < rows_A; i++) { - for(Index j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier_) + input_bias_prepared_[j]; - } - } - } - - private: - float unquant_multiplier_; - const float *input_bias_prepared_; - }; - - struct UnquantizeAndWrite { - explicit UnquantizeAndWrite(float unquant_multiplier) - : unquant_multiplier_(unquant_multiplier) {} - - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { - for(Index i = 0; i < rows_A; i++) { - for(Index j = 0; j < cols_B; j++) { - Index idx = i * cols_B + j; - output[idx] = (input[idx] * unquant_multiplier_); - } - } - } - - private: - float unquant_multiplier_; - }; -}; #if RUY_PLATFORM_NEON @@ -114,82 +38,68 @@ struct Preprocess { * Optimized path using ARM NEON SIMD intrinsics. Currently only supports int8_t. * TODO: Expand support to 16-bit. */ -template <> -struct Preprocess { - static void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { - const float32x4_t *Input = reinterpret_cast(input); - const float32x4_t *InputEnd = reinterpret_cast(input + rows * width); - - int8x8_t *Output = reinterpret_cast(output); - while(Input != InputEnd) { - // Vector multiply by scalar - // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); - // VMUL.F32 q0,q0,d0[0] - float32x4_t scaledFloat_lo = vmulq_n_f32(*Input++, scale); - - // Convert from float - // int32x4_t vcvtnq_s32_f32(float32x4_t a); - // VCVT.S32.F32 q0, q0 - int32x4_t scaledInt_lo = vcvtnq_s32_f32(scaledFloat_lo); - - // Vector saturating narrow integer - // int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 - int16x4_t s16x4_lo = vqmovn_s32(scaledInt_lo); - - // Vector multiply by scalar - // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); - // VMUL.F32 q0,q0,d0[0] - float32x4_t scaledFloat_hi = vmulq_n_f32(*Input++, scale); - - // Convert from float - // int32x4_t vcvtnq_s32_f32(float32x4_t a); - // VCVT.S32.F32 q0, q0 - int32x4_t scaledInt_hi = vcvtnq_s32_f32(scaledFloat_hi); - - // Vector saturating narrow integer - // int16x4_t vqmovn_s32(int32x4_t a); - // VQMOVN.S32 d0,q0 - int16x4_t s16x4_hi = vqmovn_s32(scaledInt_hi); - - // Combine two ints. 
- // int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); - int16x8_t s16x8 = vcombine_s16(s16x4_lo, s16x4_hi); - - // Vector saturating narrow integer - int8x8_t s8x8 = vqmovn_s16(s16x8); - - *Output = s8x8; - ++Output; - }; - } - - // Specialization for int8_t - static void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) { - constexpr Index tile_size = 16; - // TODO(jerin): Enable - // assert(rows % tile_size == 0 && cols & tile_size == 0); - for(Index i = 0; i < rows; i += tile_size) { - for(Index j = 0; j < cols; j += tile_size) { - _transpose_16x16(input, i, j, rows, cols, output); - } - } - } - - static void _transpose_16x16(const int8_t *src, - Index i, - Index j, - Index rows, - Index cols, - int8_t *dst) { - // Implemented following the algorithm described in - // https://stackoverflow.com/a/29587984/4565794 - // - // permute n 32-bit rows - // permute n 64-bit rows - // ... - // permute n simd_width/2-bit rows - - // clang-format off +inline void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { + const float32x4_t *Input = reinterpret_cast(input); + const float32x4_t *InputEnd = reinterpret_cast(input + rows * width); + + int8x8_t *Output = reinterpret_cast(output); + while(Input != InputEnd) { + // Vector multiply by scalar + // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); + // VMUL.F32 q0,q0,d0[0] + float32x4_t scaledFloat_lo = vmulq_n_f32(*Input++, scale); + + // Convert from float + // int32x4_t vcvtnq_s32_f32(float32x4_t a); + // VCVT.S32.F32 q0, q0 + int32x4_t scaledInt_lo = vcvtnq_s32_f32(scaledFloat_lo); + + // Vector saturating narrow integer + // int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 + int16x4_t s16x4_lo = vqmovn_s32(scaledInt_lo); + + // Vector multiply by scalar + // float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); + // VMUL.F32 q0,q0,d0[0] + float32x4_t scaledFloat_hi = vmulq_n_f32(*Input++, scale); + + // Convert from float + // int32x4_t vcvtnq_s32_f32(float32x4_t a); + // VCVT.S32.F32 q0, q0 + int32x4_t scaledInt_hi = vcvtnq_s32_f32(scaledFloat_hi); + + // Vector saturating narrow integer + // int16x4_t vqmovn_s32(int32x4_t a); + // VQMOVN.S32 d0,q0 + int16x4_t s16x4_hi = vqmovn_s32(scaledInt_hi); + + // Combine two ints. + // int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); + int16x8_t s16x8 = vcombine_s16(s16x4_lo, s16x4_hi); + + // Vector saturating narrow integer + int8x8_t s8x8 = vqmovn_s16(s16x8); + + *Output = s8x8; + ++Output; + }; +} + +inline void _transpose_16x16(const int8_t *src, + Index i, + Index j, + Index rows, + Index cols, + int8_t *dst) { + // Implemented following the algorithm described in + // https://stackoverflow.com/a/29587984/4565794 + // + // permute n 32-bit rows + // permute n 64-bit rows + // ... + // permute n simd_width/2-bit rows + + // clang-format off // Permute 8 8-bit rows. // Load int8x16x2 from memory into SIMD registers, transpose as 2x2 matrices. 
@@ -250,72 +160,82 @@ struct Preprocess { vst1q_s8(&dst[14*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x2.val[1]), vget_high_s32(x6.val[1])))); vst1q_s8(&dst[15*rows + dstRowBegin], vreinterpretq_s8_s32(vcombine_s32(vget_high_s32(x3.val[1]), vget_high_s32(x7.val[1])))); - // clang-format on + // clang-format on +} + +// Specialization for int8_t +inline void transpose(const int8_t *input, Index rows, Index cols, int8_t *output) { + constexpr Index tile_size = 16; + // TODO(jerin): Enable + // assert(rows % tile_size == 0 && cols & tile_size == 0); + for(Index i = 0; i < rows; i += tile_size) { + for(Index j = 0; j < cols; j += tile_size) { + _transpose_16x16(input, i, j, rows, cols, output); + } } +} - struct UnquantizeAndAddBiasAndWrite { - UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) - : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} - - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { - // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); - const int32x4_t *Input = reinterpret_cast(input); - const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); - float32x4_t *Output = reinterpret_cast(output); - - while(Input != InputEnd) { - // Bias cycles every column for addition. - const float32x4_t *Bias = reinterpret_cast(input_bias_prepared_); - - // InputEnd needs to be determined to end the while loop below. - const int32x4_t *RowEnd = reinterpret_cast( - reinterpret_cast(Input) + cols_B); - - while(Input != RowEnd) { - // Operation happening for 4-elements together: - // output = [int32_t]input * [float]quant_mult + [float]bias; - float32x4_t floatInput = vcvtq_f32_s32(*Input++); - float32x4_t unquantized = vmulq_f32(floatInput, multiplier); - *Output++ = vaddq_f32(unquantized, *Bias++); - } +struct UnquantizeAndAddBiasAndWrite { + UnquantizeAndAddBiasAndWrite(float unquant_multiplier, const float *input_bias_prepared) + : unquant_multiplier_(unquant_multiplier), input_bias_prepared_(input_bias_prepared) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // Bias cycles every column for addition. + const float32x4_t *Bias = reinterpret_cast(input_bias_prepared_); + + // InputEnd needs to be determined to end the while loop below. 
+ const int32x4_t *RowEnd + = reinterpret_cast(reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = vaddq_f32(unquantized, *Bias++); } } + } - private: - float unquant_multiplier_; - const float *input_bias_prepared_; - }; +private: + float unquant_multiplier_; + const float *input_bias_prepared_; +}; - struct UnquantizeAndWrite { - explicit UnquantizeAndWrite(float unquant_multiplier) - : unquant_multiplier_(unquant_multiplier) {} - - void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { - // Set all registers in lane from same scalar value. - float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); - const int32x4_t *Input = reinterpret_cast(input); - const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); - float32x4_t *Output = reinterpret_cast(output); - - while(Input != InputEnd) { - // InputEnd needs to be determined to end the while loop below. - const int32x4_t *RowEnd = reinterpret_cast( - reinterpret_cast(Input) + cols_B); - - while(Input != RowEnd) { - // Operation happening for 4-elements together: - // output = [int32_t]input * [float]quant_mult + [float]bias; - float32x4_t floatInput = vcvtq_f32_s32(*Input++); - float32x4_t unquantized = vmulq_f32(floatInput, multiplier); - *Output++ = unquantized; - } +struct UnquantizeAndWrite { + explicit UnquantizeAndWrite(float unquant_multiplier) : unquant_multiplier_(unquant_multiplier) {} + + void operator()(const int32_t *input, Index rows_A, Index cols_B, float *output) const { + // Set all registers in lane from same scalar value. + float32x4_t multiplier = vdupq_n_f32(unquant_multiplier_); + const int32x4_t *Input = reinterpret_cast(input); + const int32x4_t *InputEnd = reinterpret_cast(input + rows_A * cols_B); + float32x4_t *Output = reinterpret_cast(output); + + while(Input != InputEnd) { + // InputEnd needs to be determined to end the while loop below. 
+ const int32x4_t *RowEnd + = reinterpret_cast(reinterpret_cast(Input) + cols_B); + + while(Input != RowEnd) { + // Operation happening for 4-elements together: + // output = [int32_t]input * [float]quant_mult + [float]bias; + float32x4_t floatInput = vcvtq_f32_s32(*Input++); + float32x4_t unquantized = vmulq_f32(floatInput, multiplier); + *Output++ = unquantized; } } + } - private: - float unquant_multiplier_; - }; +private: + float unquant_multiplier_; }; #endif @@ -340,7 +260,7 @@ struct IntgemmViaRuy { float quant_mult, Index rows, Index cols) { - Preprocess::quantize(input, output, quant_mult, rows, cols); + quantize(input, output, quant_mult, rows, cols); } static void PrepareB(const float *, Type *, float, Index, Index) { @@ -352,7 +272,7 @@ struct IntgemmViaRuy { float quant_mult, Index rows, Index cols) { - Preprocess::quantize(input, output, quant_mult, rows, cols); + quantize(input, output, quant_mult, rows, cols); } static void SelectColumnsB(const Type *input, From 4a8c0da63cca0110478815ccc8f24f890ae34e3a Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 10:08:32 +0000 Subject: [PATCH 47/67] Remove leftover gcc diagnostic pop for SIMDE --- src/3rd_party/sse_mathfun.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/3rd_party/sse_mathfun.h b/src/3rd_party/sse_mathfun.h index bb0be6c65..91155cac3 100644 --- a/src/3rd_party/sse_mathfun.h +++ b/src/3rd_party/sse_mathfun.h @@ -712,6 +712,3 @@ static inline void sincos_ps(v4sf x, v4sf *s, v4sf *c) { *c = _mm_xor_ps(xmm2, sign_bit_cos); } -#ifdef USE_SIMDE -#pragma GCC diagnostic pop -#endif From e17a5dd891550b7f67304aac2c038bbfacb8fbd5 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 10:09:49 +0000 Subject: [PATCH 48/67] Remove simde-no-tests reference in CMakeLists.txt file --- src/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5845a6710..35bd6866a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,7 +8,6 @@ include_directories(3rd_party/sentencepiece/third_party/protobuf-lite) include_directories(3rd_party/fbgemm/include) include_directories(3rd_party/intgemm) include_directories(3rd_party/ruy) -include_directories(3rd_party/simde-no-tests) include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party/intgemm) include_directories(${CMAKE_CURRENT_BINARY_DIR}/3rd_party) include_directories(${CMAKE_BINARY_DIR}/local/include) From 3c8a149c5720a6549c356895e52c1875658b507b Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 10:21:58 +0000 Subject: [PATCH 49/67] Remove obsolete comments --- src/tensors/cpu/ruy_adapter.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 90383e86a..4066ac895 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -22,21 +22,10 @@ namespace integer { using Index = unsigned int; -// The following partitions a pure C++ slow implementation and a faster SIMD implementation using -// NEON intrinsics on ARM hardware. Ruy already has such a routing, but we add some preprocessing -// and postprocessing functions (quantize, transpose, unquantize) that are outside ruy's offerings -// and required in the fast matrix-multiplication workflow for machine-translation, that exists in -// marian. - -/* - * Naive implementation using standard C++ functions. Not optimized using SIMD operations. - */ - #if RUY_PLATFORM_NEON /* * Optimized path using ARM NEON SIMD intrinsics. 
Currently only supports int8_t. - * TODO: Expand support to 16-bit. */ inline void quantize(const float *input, int8_t *output, float scale, Index rows, Index width) { const float32x4_t *Input = reinterpret_cast(input); From 800402c85ab3293d311ea28b8987433b12c0333e Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 14 Jun 2022 13:55:56 +0000 Subject: [PATCH 50/67] Explain copying x86-SSE structure for NEON --- src/common/types.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/common/types.h b/src/common/types.h index 621551daf..6052896fc 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -173,15 +173,22 @@ struct intgemm8 { #if defined(__ARM_NEON) || defined(__ARM_NEON__) +// The following struct fills this structure for ARM with NEON SIMD, changing +// __m128 and _mm_set1_ps with the equivalents on ARM-NEON. struct float32x4 { private: + // NEON uses 128-bit SIMD registers, same as SSE. We are copying this class + // and locally aliasing __m128 to float32x4_t, which is the NEON + // equivalent. using __m128 = float32x4_t; __m128 f_; public: float32x4() {} float32x4(const __m128& f) : f_(f) {} - float32x4(const float& f) : f_(vdupq_n_f32(f)) {} // __m128 _mm_set1_ps(float) copies value into all slots + // __m128 _mm_set1_ps(float) copies value into all slots, vdupq_n_f32 is it's + // NEON equivalent. + float32x4(const float& f) : f_(vdupq_n_f32(f)) {} operator const __m128&() const { return f_; } operator __m128&() { return f_; } From 9d648d0ebfeee5b10591280f10e0863bbc5e8a37 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 20 Jun 2022 10:30:55 +0000 Subject: [PATCH 51/67] Remove executable upload for android --- .github/workflows/arm.yml | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/.github/workflows/arm.yml b/.github/workflows/arm.yml index 9667ece05..55e2978cc 100644 --- a/.github/workflows/arm.yml +++ b/.github/workflows/arm.yml @@ -118,22 +118,3 @@ jobs: path: ${{github.workspace}}/build/marian-decoder - # Disable release for now. - # release: - # name: Release Latest Build - # runs-on: ubuntu-latest - # needs: [ubuntu] - # if: github.ref == 'refs/heads/master' - # steps: - # - name: Download artifacts - # uses: actions/download-artifact@v2 - # - # - name: Update GitHub prerelease - # uses: marvinpinto/action-automatic-releases@latest - # with: - # repo_token: ${{ secrets.GITHUB_TOKEN }} - # automatic_release_tag: latest - # prerelease: true - # title: "Latest Build" - # files: | - # artifact/marian-decoder From d19a3123b47d7370841ec3ac53edade132c23b4c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Mon, 20 Jun 2022 10:31:56 +0000 Subject: [PATCH 52/67] Remove comment --- src/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 35bd6866a..76aa0e2b3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -101,7 +101,6 @@ set(MARIAN_SOURCES ) if (NOT USE_WASM_COMPATIBLE_SOURCE AND NOT ANDROID) - # Hi WASM, Android hates this too. 
list(APPEND MARIAN_SOURCES 3rd_party/ExceptionWithCallStack.cpp ) From 1b38e01bb8c7738dace52ed84d972ca4f9b74897 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 09:51:32 +0000 Subject: [PATCH 53/67] Restore -Werror --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 433258442..f39e2acc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,7 @@ else(MSVC) set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${CLANG_IGNORE_UNUSED_VALUES}") # This needs to appear here as well to appease clang11+ on linux # These are used in src/CMakeLists.txt on a per-target basis - list(APPEND ALL_WARNINGS -Wall; # -Werror; + list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function; -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; From 9027ea4c47a8e0ae031dcd52529c3e9c4e79a557 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 10:06:59 +0000 Subject: [PATCH 54/67] Switch to a {{0}} sigaction on WASM, {0} for rest --- src/common/logging.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 51d80510a..3a7afd846 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -128,7 +128,11 @@ static void setErrorHandlers() { std::set_terminate(unhandledException); #ifdef __unix__ // catch segfaults +#ifdef WASM_COMPATIBLE_SOURCE + struct sigaction sa = {{ 0 }}; +#else // WASM_COMPATIBLE_SOURCE struct sigaction sa = { 0 }; +#endif // WASM_COMPATIBLE_SOURCE sigemptyset(&sa.sa_mask); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); }; From a0ee5275d5e95a4ba810ff18d3d165d6b93a501d Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 20:32:06 +0000 Subject: [PATCH 55/67] Revert "Restore -Werror" This reverts commit 1b38e01bb8c7738dace52ed84d972ca4f9b74897. 
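The -Werror flip-flop (restored in patch 53, reverted here) is resolved for good in patches 60 and 61: the offending warnings come from third-party headers (ruy trips -Wcomment, simd_utils trips -Wstrict-aliasing), so those includes get wrapped in diagnostic pragmas and the flag goes back on for marian's own sources. The wrapping pattern in its full push/pop form, shown for one representative header:

    // Narrowly silence a warning that originates inside a third-party
    // header, leaving -Werror intact for everything after the pop.
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wcomment"
    #include "ruy/ruy.h"
    #pragma GCC diagnostic pop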
--- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f39e2acc5..433258442 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,7 @@ else(MSVC) set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${CLANG_IGNORE_UNUSED_VALUES}") # This needs to appear here as well to appease clang11+ on linux # These are used in src/CMakeLists.txt on a per-target basis - list(APPEND ALL_WARNINGS -Wall; -Werror; + list(APPEND ALL_WARNINGS -Wall; # -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function; -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; From 6285f28209b95a05a5ba883bc763b7f3d9e350a1 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 20:41:08 +0000 Subject: [PATCH 56/67] Use -DFMA for NEON from simd_utils example --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 433258442..32309e33e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,7 +80,7 @@ if(USE_INTGEMM) endif(USE_INTGEMM) if(USE_SIMD_UTILS) - add_compile_definitions(ARM SSE) #added for ARM + add_compile_definitions(ARM FMA SSE) #added for ARM if(MSVC) add_compile_options(/flax-vector-conversions) else(MSVC) From 8895fda8ed69a7da0ecc2abd83ff5c14c785416c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 20:48:15 +0000 Subject: [PATCH 57/67] Remove redundant neon_mathfun include after simd_utils.h --- src/functional/operators.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/functional/operators.h b/src/functional/operators.h index 00cb9c819..7bffec066 100755 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -215,7 +215,6 @@ struct Ops { #if defined(__ARM_NEON) || defined(__ARM_NEON__) #include "3rd_party/simd_utils/simd_utils.h" -#include "3rd_party/simd_utils/neon_mathfun.h" #else #include "3rd_party/sse_mathfun.h" #endif From c6c3ac6fc1e64862f3c808d127e07cc5cb72406c Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Tue, 21 Jun 2022 20:50:32 +0000 Subject: [PATCH 58/67] Wrap CmakeLists.txt ARM definitions with an if --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 32309e33e..00c73500c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -80,7 +80,9 @@ if(USE_INTGEMM) endif(USE_INTGEMM) if(USE_SIMD_UTILS) - add_compile_definitions(ARM FMA SSE) #added for ARM + if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm") + add_compile_definitions(ARM FMA SSE) #added for ARM + endif() if(MSVC) add_compile_options(/flax-vector-conversions) else(MSVC) From 3baf620296502bae093f9a3b537ea889efe4c9cb Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 23 Jun 2022 22:25:46 +0000 Subject: [PATCH 59/67] Use __clang__ instead of WASM_COMPATIBLE_SOURCE; emcc uses LLVM --- src/common/logging.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/common/logging.cpp b/src/common/logging.cpp index 3a7afd846..69c3ccb78 100644 --- a/src/common/logging.cpp +++ b/src/common/logging.cpp @@ -128,11 +128,14 @@ static void setErrorHandlers() { std::set_terminate(unhandledException); #ifdef __unix__ // catch segfaults -#ifdef WASM_COMPATIBLE_SOURCE +// Emscripten uses Clang/LLVM as its underlying codegen compiler, so the +// preprocessor defines __llvm__ and __clang__ are defined. +// Exists to appease -Werror gods. 
+#ifdef __clang__ struct sigaction sa = {{ 0 }}; -#else // WASM_COMPATIBLE_SOURCE +#else // __clang__ struct sigaction sa = { 0 }; -#endif // WASM_COMPATIBLE_SOURCE +#endif // __clang__ sigemptyset(&sa.sa_mask); sa.sa_flags = SA_SIGINFO; sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); }; From aa1842cc527418790d24513b4e605b422406a5c4 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 23 Jun 2022 22:26:44 +0000 Subject: [PATCH 60/67] Suppress warnings by #pragma GCC diagnostic ... --- src/functional/operators.h | 3 +++ src/tensors/cpu/integer_common.h | 4 ++++ src/tensors/cpu/prod_blas.h | 3 +++ src/tensors/cpu/ruy_adapter.h | 3 +++ 4 files changed, 13 insertions(+) diff --git a/src/functional/operators.h b/src/functional/operators.h index 7bffec066..1a67e22a1 100755 --- a/src/functional/operators.h +++ b/src/functional/operators.h @@ -214,7 +214,10 @@ struct Ops { #ifndef __CUDACC__ #if defined(__ARM_NEON) || defined(__ARM_NEON__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" #include "3rd_party/simd_utils/simd_utils.h" +#pragma GCC diagnostic pop #else #include "3rd_party/sse_mathfun.h" #endif diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h index 67ee1e94f..389f5c4e8 100644 --- a/src/tensors/cpu/integer_common.h +++ b/src/tensors/cpu/integer_common.h @@ -7,7 +7,11 @@ #ifdef USE_INTGEMM #include "3rd_party/intgemm/intgemm/intgemm.h" #else // USE_INTGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wall" #include +#pragma GCC diagnostic pop + #include "ruy_adapter.h" #endif // USE_INTGEMM #if defined(WASM) diff --git a/src/tensors/cpu/prod_blas.h b/src/tensors/cpu/prod_blas.h index bab16cfbd..61053b402 100644 --- a/src/tensors/cpu/prod_blas.h +++ b/src/tensors/cpu/prod_blas.h @@ -6,8 +6,11 @@ #elif USE_ONNX_SGEMM #include "3rd_party/onnxjs/src/wasm-ops/gemm.h" #elif USE_RUY_SGEMM +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" #include "ruy/ruy.h" #include "ruy/system_aligned_alloc.h" +#pragma GCC pop #endif #if USE_RUY_SGEMM diff --git a/src/tensors/cpu/ruy_adapter.h b/src/tensors/cpu/ruy_adapter.h index 4066ac895..ad6bc1ce6 100644 --- a/src/tensors/cpu/ruy_adapter.h +++ b/src/tensors/cpu/ruy_adapter.h @@ -9,8 +9,11 @@ #include #include #include +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcomment" #include "ruy/platform.h" #include "ruy/system_aligned_alloc.h" +#pragma GCC diagnostic pop #if RUY_PLATFORM_NEON #include From 8eae08bca9ebb42e0c58f3a630a88b189cb86ab5 Mon Sep 17 00:00:00 2001 From: Jerin Philip Date: Thu, 23 Jun 2022 22:27:05 +0000 Subject: [PATCH 61/67] Re-enable -Werror --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 00c73500c..f505188d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -310,8 +310,7 @@ else(MSVC) set(DISABLE_GLOBALLY "-Wno-unused-result ${CLANG_IGNORE_UNKNOWN_CUDA} ${CLANG_IGNORE_UNUSED_VALUES}") # This needs to appear here as well to appease clang11+ on linux # These are used in src/CMakeLists.txt on a per-target basis - list(APPEND ALL_WARNINGS -Wall; # -Werror; - -Wextra; -Wno-unused-result; -Wno-deprecated; + list(APPEND ALL_WARNINGS -Wall; -Werror; -Wextra; -Wno-unused-result; -Wno-deprecated; -Wno-pragmas; -Wno-unused-parameter; -Wno-unused-function; -Wno-unused-value; -Wno-unknown-pragmas; -Wno-sign-compare; -Wno-missing-field-initializers; ${CLANG_IGNORE_UNUSED_PRIVATE_FIELD}) From 
From 9a541c456a864a59502fdc5fee494cf2965dd671 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Thu, 23 Jun 2022 22:50:48 +0000
Subject: [PATCH 62/67] {0} -> {} to work around empty-braces Werror

---
 src/common/logging.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/common/logging.cpp b/src/common/logging.cpp
index 69c3ccb78..c21cf4207 100644
--- a/src/common/logging.cpp
+++ b/src/common/logging.cpp
@@ -128,14 +128,7 @@ static void setErrorHandlers() {
   std::set_terminate(unhandledException);
 #ifdef __unix__ // catch segfaults
-// Emscripten uses Clang/LLVM as its underlying codegen compiler, so the
-// preprocessor macros __llvm__ and __clang__ are defined.
-// Exists to appease -Werror gods.
-#ifdef __clang__
-  struct sigaction sa = {{ 0 }};
-#else // __clang__
-  struct sigaction sa = { 0 };
-#endif // __clang__
+  struct sigaction sa = {};
   sigemptyset(&sa.sa_mask);
   sa.sa_flags = SA_SIGINFO;
   sa.sa_sigaction = [](int /*signal*/, siginfo_t*, void*) { ABORT("Segmentation fault"); };
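Note: the reason empty braces end the dance above is that struct sigaction begins with a nested union/struct on some libcs, so "{ 0 }" explicitly initializes only that first member and clang complains (-Wmissing-braces), while "{{ 0 }}" is not accepted everywhere either. Value-initialization zeroes the whole object and is spelled identically under GCC, Clang, and emcc. A sketch:

    #include <csignal>

    // Portable zero-initialization: every member, including nested
    // aggregates, is value-initialized; no compiler asks for extra braces.
    struct sigaction makeZeroedSigaction() {
      struct sigaction sa = {};
      sigemptyset(&sa.sa_mask);
      return sa;
    }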
From 4b8039901ba29a427f33b42d4d25c24aced47018 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Fri, 24 Jun 2022 07:46:52 +0000
Subject: [PATCH 63/67] Replace -Wall with -Wcomment

---
 src/tensors/cpu/integer_common.h | 226 ++++++++++++++++++-------------
 1 file changed, 132 insertions(+), 94 deletions(-)

diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index 389f5c4e8..ad46735a0 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -1,19 +1,19 @@
 #pragma once

+#include "common/io_item.h"
+#include "tensors/cpu/aligned.h"
 #include "tensors/tensor_allocator.h"
 #include "tensors/tensor_operators.h"
-#include "tensors/cpu/aligned.h"
-#include "common/io_item.h"

 #ifdef USE_INTGEMM
 #include "3rd_party/intgemm/intgemm/intgemm.h"
-#else // USE_INTGEMM
+#else  // USE_INTGEMM
 #pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wall"
+#pragma GCC diagnostic ignored "-Wcomment"
 #include <ruy/ruy.h>
 #pragma GCC diagnostic pop

 #include "ruy_adapter.h"
-#endif // USE_INTGEMM
+#endif  // USE_INTGEMM
 #if defined(WASM)
 #include "wasm_intgemm_interface.h"
 #endif
@@ -25,106 +25,144 @@ namespace marian {
 namespace cpu {
 namespace integer {

-//Convenient function to get rows and columns of a tensor, shadowed by namespace.
-inline int cols(Tensor& tensor) { return tensor->shape()[-1]; }
-inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }
+// Convenient function to get rows and columns of a tensor, shadowed by namespace.
+inline int cols(Tensor &tensor) {
+  return tensor->shape()[-1];
+}
+inline int rows(Tensor &tensor) {
+  return tensor->shape().elements() / cols(tensor);
+}

-inline int cols(Shape& shape) { return shape[-1]; }
-inline int rows(Shape& shape) { return shape.elements() / cols(shape); }
+inline int cols(Shape &shape) {
+  return shape[-1];
+}
+inline int rows(Shape &shape) {
+  return shape.elements() / cols(shape);
+}

 // This operates on floats after processing so doesn't care about int8_t vs int16_t.
 void AddBias(marian::Tensor C, const marian::Tensor Bias);

 #ifdef USE_INTGEMM
-template <Type type> struct intgemm_;
-template <> struct intgemm_<Type::intgemm8> {using width = intgemm::Int8;
-                                             using type = int8_t;
-                                             constexpr static const Type intgemmType = Type::intgemm8;};
-template <> struct intgemm_<Type::intgemm16> {using width = intgemm::Int16;
-                                              using type = int16_t;
-                                              constexpr static const Type intgemmType = Type::intgemm16;};
-
-
-
-#else // USE_INTGEMM
-
-template <Type type> struct intgemm_;
-template <> struct intgemm_<Type::intgemm8> {using width = IntgemmViaRuy::Int8;
-                                             using type = IntgemmViaRuy::Int8::Type;
-                                             constexpr static const Type intgemmType = Type::intgemm8;};
-template <> struct intgemm_<Type::intgemm16> {using width = IntgemmViaRuy::Int16;
-                                              using type = IntgemmViaRuy::Int16::Type;
-                                              constexpr static const Type intgemmType = Type::intgemm16;};
-
-#endif // USE_INTGEMM
+template <Type type>
+struct intgemm_;
+template <>
+struct intgemm_<Type::intgemm8> {
+  using width = intgemm::Int8;
+  using type = int8_t;
+  constexpr static const Type intgemmType = Type::intgemm8;
+};
+template <>
+struct intgemm_<Type::intgemm16> {
+  using width = intgemm::Int16;
+  using type = int16_t;
+  constexpr static const Type intgemmType = Type::intgemm16;
+};
+
+#else  // USE_INTGEMM
+
+template <Type type>
+struct intgemm_;
+template <>
+struct intgemm_<Type::intgemm8> {
+  using width = IntgemmViaRuy::Int8;
+  using type = IntgemmViaRuy::Int8::Type;
+  constexpr static const Type intgemmType = Type::intgemm8;
+};
+template <>
+struct intgemm_<Type::intgemm16> {
+  using width = IntgemmViaRuy::Int16;
+  using type = IntgemmViaRuy::Int16::Type;
+  constexpr static const Type intgemmType = Type::intgemm16;
+};
+
+#endif  // USE_INTGEMM

 // For loading architecture agnostic models. We do PrepareAndTranspose, because we already transposed
 // in our binary format. Then we copy the quantizationMultiplier information at the end
-template<Type vtype>
-void prepareAndTransposeB(io::Item& item, const char * input) {
-  #ifdef COMPILE_CPU
-    typedef typename intgemm_<vtype>::type Integer;
-    Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
-    // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
-    // If this is the case, we will need to temporarily allocate aligned memory, copy the results, and then free it
-    if (reinterpret_cast<uintptr_t>(input) % 64 == 0 && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
-      #if defined(WASM)
-        ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
-                 "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
-        int8PrepareBFromQuantizedTransposed(reinterpret_cast<const int8_t *>(input),
-                                   (Index)rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary
-                                   (Index)cols(item.shape), //rows here returns the columns of the transposed input matrix, and cols -> the rows
-                                   (int8_t *)output_tensor);
-      #else
-        intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(input),
-                                   output_tensor,
-                                   rows(item.shape),  //Since we only transposed, but didn't update the shape when constructing the binary,
-                                   cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
-      #endif
-    } else {
-      Integer * aligned_input = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
-      std::copy(input, input + rows(item.shape)*cols(item.shape), aligned_input);
-      Integer * aligned_output = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
-      #if defined(WASM)
-        ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
-                 "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
-        int8PrepareBFromQuantizedTransposed(reinterpret_cast<const int8_t *>(aligned_input),
-                                   (Index)rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary,
-                                   (Index)cols(item.shape), //rows here returns the columns of the transposed input matrix, and cols -> the rows
-                                   reinterpret_cast<int8_t *>(aligned_output));
-      #else
-        intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(aligned_input),
-                                   reinterpret_cast<Integer *>(aligned_output),
-                                   rows(item.shape),  //Since we only transposed, but didn't update the shape when constructing the binary,
-                                   cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
-      #endif
-      // Copy to output tensor
-      std::copy(aligned_output, aligned_output + rows(item.shape)*cols(item.shape), output_tensor);
-      genericFree(aligned_input);
-      genericFree(aligned_output);
-    }
-    //Copy the quantMult
-    float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
-    *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
-  #else // COMPILE_CPU
-    ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
-  #endif
+template <Type vtype>
+void prepareAndTransposeB(io::Item &item, const char *input) {
+#ifdef COMPILE_CPU
+  typedef typename intgemm_<vtype>::type Integer;
+  Integer *output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
+  // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
+  // If this is the case, we will need to temporarily allocate aligned memory, copy the results,
+  // and then free it
+  if(reinterpret_cast<uintptr_t>(input) % 64 == 0
+     && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
+#if defined(WASM)
+    ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
+             "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
+    int8PrepareBFromQuantizedTransposed(
+        reinterpret_cast<const int8_t *>(input),
+        (Index)rows(item.shape),  // Since we only transposed, but didn't update the shape when
+                                  // constructing the binary
+        (Index)cols(item.shape),  // rows here returns the columns of the transposed input matrix,
+                                  // and cols -> the rows
+        (int8_t *)output_tensor);
+#else
+    intgemm_<vtype>::width::PrepareBQuantizedTransposed(
+        reinterpret_cast<const Integer *>(input),
+        output_tensor,
+        rows(item.shape),   // Since we only transposed, but didn't update the shape when
+                            // constructing the binary,
+        cols(item.shape));  // rows here returns the columns of the transposed input matrix, and
+                            // cols -> the rows
+#endif
+  } else {
+    Integer *aligned_input = reinterpret_cast<Integer *>(
+        genericMalloc(512, rows(item.shape) * cols(item.shape) * sizeof(Integer)));
+    std::copy(input, input + rows(item.shape) * cols(item.shape), aligned_input);
+    Integer *aligned_output = reinterpret_cast<Integer *>(
+        genericMalloc(512, rows(item.shape) * cols(item.shape) * sizeof(Integer)));
+#if defined(WASM)
+    ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
+             "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
+    int8PrepareBFromQuantizedTransposed(
+        reinterpret_cast<const int8_t *>(aligned_input),
+        (Index)rows(item.shape),  // Since we only transposed, but didn't update the shape when
+                                  // constructing the binary,
+        (Index)cols(item.shape),  // rows here returns the columns of the transposed input matrix,
+                                  // and cols -> the rows
+        reinterpret_cast<int8_t *>(aligned_output));
+#else
+    intgemm_<vtype>::width::PrepareBQuantizedTransposed(
+        reinterpret_cast<const Integer *>(aligned_input),
+        reinterpret_cast<Integer *>(aligned_output),
+        rows(item.shape),   // Since we only transposed, but didn't update the shape when
+                            // constructing the binary,
+        cols(item.shape));  // rows here returns the columns of the transposed input matrix, and
+                            // cols -> the rows
+#endif
+    // Copy to output tensor
+    std::copy(aligned_output, aligned_output + rows(item.shape) * cols(item.shape), output_tensor);
+    genericFree(aligned_input);
+    genericFree(aligned_output);
+  }
+  // Copy the quantMult
+  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input)
+                                                      + item.shape.elements()));
+  *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
+#else  // COMPILE_CPU
+  ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
+#endif
 }

-template<Type vtype>
-void unquantizeWemb(io::Item& item, const char * input) {
-    typedef typename intgemm_<vtype>::type Integer;
-    float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
-    float * output_tensor = reinterpret_cast<float *>(&(*item.bytes.begin()));
-    // Explicitly calculate n once beforehand because the compiler does not pick up on its
-    // static nature, and will end up calling marian::Shape::dim() a lot.
-    const size_t n = rows(item.shape) * cols(item.shape);
-    for (size_t i = 0; i < n; i++) {
-      output_tensor[i] = reinterpret_cast<const Integer *>(input)[i]*(1/quantMult);
-    }
+template <Type vtype>
+void unquantizeWemb(io::Item &item, const char *input) {
+  typedef typename intgemm_<vtype>::type Integer;
+  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input)
+                                                      + item.shape.elements()));
+  float *output_tensor = reinterpret_cast<float *>(&(*item.bytes.begin()));
+  // Explicitly calculate n once beforehand because the compiler does not pick up on its
+  // static nature, and will end up calling marian::Shape::dim() a lot.
+  const size_t n = rows(item.shape) * cols(item.shape);
+  for(size_t i = 0; i < n; i++) {
+    output_tensor[i] = reinterpret_cast<const Integer *>(input)[i] * (1 / quantMult);
+  }
 }

-} //integer
-} //cpu
-} //marian
+}  // namespace integer
+}  // namespace cpu
+}  // namespace marian
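Note: the alignment fallback documented in prepareAndTransposeB generalizes to a simple pattern: run in place when both buffers already satisfy the kernel's alignment requirement, otherwise stage through aligned scratch memory. A standalone sketch under stated assumptions (processAligned, Kernel, and the 64-byte requirement are illustrative, not marian's exact code):

    #include <cstdint>
    #include <cstdlib>
    #include <cstring>

    using Kernel = void (*)(const char *in, char *out, std::size_t bytes);

    void processAligned(const char *input, char *output, std::size_t bytes, Kernel kernel) {
      bool aligned = reinterpret_cast<std::uintptr_t>(input) % 64 == 0
                     && reinterpret_cast<std::uintptr_t>(output) % 64 == 0;
      if(aligned) {
        kernel(input, output, bytes);  // fast path: operate in place
        return;
      }
      // aligned_alloc requires the size to be a multiple of the alignment.
      std::size_t padded = (bytes + 63) / 64 * 64;
      char *in = static_cast<char *>(std::aligned_alloc(64, padded));
      char *out = static_cast<char *>(std::aligned_alloc(64, padded));
      std::memcpy(in, input, bytes);   // stage through aligned scratch
      kernel(in, out, bytes);
      std::memcpy(output, out, bytes); // copy the result back
      std::free(in);
      std::free(out);
    }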
From ac8de9196d38da03251f2799c92ee8b8ca240489 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Fri, 24 Jun 2022 07:47:40 +0000
Subject: [PATCH 64/67] Revert "Replace -Wall with -Wcomment"

This reverts commit 4b8039901ba29a427f33b42d4d25c24aced47018.
---
 src/tensors/cpu/integer_common.h | 226 +++++++++++++------------------
 1 file changed, 94 insertions(+), 132 deletions(-)

diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index ad46735a0..389f5c4e8 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -1,19 +1,19 @@
 #pragma once

-#include "common/io_item.h"
-#include "tensors/cpu/aligned.h"
 #include "tensors/tensor_allocator.h"
 #include "tensors/tensor_operators.h"
+#include "tensors/cpu/aligned.h"
+#include "common/io_item.h"

 #ifdef USE_INTGEMM
 #include "3rd_party/intgemm/intgemm/intgemm.h"
-#else  // USE_INTGEMM
+#else // USE_INTGEMM
 #pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wcomment"
+#pragma GCC diagnostic ignored "-Wall"
 #include <ruy/ruy.h>
 #pragma GCC diagnostic pop

 #include "ruy_adapter.h"
-#endif  // USE_INTGEMM
+#endif // USE_INTGEMM
 #if defined(WASM)
 #include "wasm_intgemm_interface.h"
 #endif
@@ -25,144 +25,106 @@ namespace marian {
 namespace cpu {
 namespace integer {

-// Convenient function to get rows and columns of a tensor, shadowed by namespace.
-inline int cols(Tensor &tensor) {
-  return tensor->shape()[-1];
-}
-inline int rows(Tensor &tensor) {
-  return tensor->shape().elements() / cols(tensor);
-}
+//Convenient function to get rows and columns of a tensor, shadowed by namespace.
+inline int cols(Tensor& tensor) { return tensor->shape()[-1]; }
+inline int rows(Tensor& tensor) { return tensor->shape().elements() / cols(tensor); }

-inline int cols(Shape &shape) {
-  return shape[-1];
-}
-inline int rows(Shape &shape) {
-  return shape.elements() / cols(shape);
-}
+inline int cols(Shape& shape) { return shape[-1]; }
+inline int rows(Shape& shape) { return shape.elements() / cols(shape); }

 // This operates on floats after processing so doesn't care about int8_t vs int16_t.
 void AddBias(marian::Tensor C, const marian::Tensor Bias);

 #ifdef USE_INTGEMM
-template <Type type>
-struct intgemm_;
-template <>
-struct intgemm_<Type::intgemm8> {
-  using width = intgemm::Int8;
-  using type = int8_t;
-  constexpr static const Type intgemmType = Type::intgemm8;
-};
-template <>
-struct intgemm_<Type::intgemm16> {
-  using width = intgemm::Int16;
-  using type = int16_t;
-  constexpr static const Type intgemmType = Type::intgemm16;
-};
-
-#else  // USE_INTGEMM
-
-template <Type type>
-struct intgemm_;
-template <>
-struct intgemm_<Type::intgemm8> {
-  using width = IntgemmViaRuy::Int8;
-  using type = IntgemmViaRuy::Int8::Type;
-  constexpr static const Type intgemmType = Type::intgemm8;
-};
-template <>
-struct intgemm_<Type::intgemm16> {
-  using width = IntgemmViaRuy::Int16;
-  using type = IntgemmViaRuy::Int16::Type;
-  constexpr static const Type intgemmType = Type::intgemm16;
-};
-
-#endif  // USE_INTGEMM
+template <Type type> struct intgemm_;
+template <> struct intgemm_<Type::intgemm8> {using width = intgemm::Int8;
+                                             using type = int8_t;
+                                             constexpr static const Type intgemmType = Type::intgemm8;};
+template <> struct intgemm_<Type::intgemm16> {using width = intgemm::Int16;
+                                              using type = int16_t;
+                                              constexpr static const Type intgemmType = Type::intgemm16;};
+
+
+
+#else // USE_INTGEMM
+
+template <Type type> struct intgemm_;
+template <> struct intgemm_<Type::intgemm8> {using width = IntgemmViaRuy::Int8;
+                                             using type = IntgemmViaRuy::Int8::Type;
+                                             constexpr static const Type intgemmType = Type::intgemm8;};
+template <> struct intgemm_<Type::intgemm16> {using width = IntgemmViaRuy::Int16;
+                                              using type = IntgemmViaRuy::Int16::Type;
+                                              constexpr static const Type intgemmType = Type::intgemm16;};
+
+#endif // USE_INTGEMM

 // For loading architecture agnostic models. We do PrepareAndTranspose, because we already transposed
 // in our binary format. Then we copy the quantizationMultiplier information at the end
-template <Type vtype>
-void prepareAndTransposeB(io::Item &item, const char *input) {
-#ifdef COMPILE_CPU
-  typedef typename intgemm_<vtype>::type Integer;
-  Integer *output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
-  // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
-  // If this is the case, we will need to temporarily allocate aligned memory, copy the results,
-  // and then free it
-  if(reinterpret_cast<uintptr_t>(input) % 64 == 0
-     && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
-#if defined(WASM)
-    ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
-             "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
-    int8PrepareBFromQuantizedTransposed(
-        reinterpret_cast<const int8_t *>(input),
-        (Index)rows(item.shape),  // Since we only transposed, but didn't update the shape when
-                                  // constructing the binary
-        (Index)cols(item.shape),  // rows here returns the columns of the transposed input matrix,
-                                  // and cols -> the rows
-        (int8_t *)output_tensor);
-#else
-    intgemm_<vtype>::width::PrepareBQuantizedTransposed(
-        reinterpret_cast<const Integer *>(input),
-        output_tensor,
-        rows(item.shape),   // Since we only transposed, but didn't update the shape when
-                            // constructing the binary,
-        cols(item.shape));  // rows here returns the columns of the transposed input matrix, and
-                            // cols -> the rows
-#endif
-  } else {
-    Integer *aligned_input = reinterpret_cast<Integer *>(
-        genericMalloc(512, rows(item.shape) * cols(item.shape) * sizeof(Integer)));
-    std::copy(input, input + rows(item.shape) * cols(item.shape), aligned_input);
-    Integer *aligned_output = reinterpret_cast<Integer *>(
-        genericMalloc(512, rows(item.shape) * cols(item.shape) * sizeof(Integer)));
-#if defined(WASM)
-    ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
-             "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
-    int8PrepareBFromQuantizedTransposed(
-        reinterpret_cast<const int8_t *>(aligned_input),
-        (Index)rows(item.shape),  // Since we only transposed, but didn't update the shape when
-                                  // constructing the binary,
-        (Index)cols(item.shape),  // rows here returns the columns of the transposed input matrix,
-                                  // and cols -> the rows
-        reinterpret_cast<int8_t *>(aligned_output));
-#else
-    intgemm_<vtype>::width::PrepareBQuantizedTransposed(
-        reinterpret_cast<const Integer *>(aligned_input),
-        reinterpret_cast<Integer *>(aligned_output),
-        rows(item.shape),   // Since we only transposed, but didn't update the shape when
-                            // constructing the binary,
-        cols(item.shape));  // rows here returns the columns of the transposed input matrix, and
-                            // cols -> the rows
-#endif
-    // Copy to output tensor
-    std::copy(aligned_output, aligned_output + rows(item.shape) * cols(item.shape), output_tensor);
-    genericFree(aligned_input);
-    genericFree(aligned_output);
-  }
-  // Copy the quantMult
-  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input)
-                                                      + item.shape.elements()));
-  *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
-#else  // COMPILE_CPU
-  ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
-#endif
+template<Type vtype>
+void prepareAndTransposeB(io::Item& item, const char * input) {
+  #ifdef COMPILE_CPU
+    typedef typename intgemm_<vtype>::type Integer;
+    Integer * output_tensor = reinterpret_cast<Integer *>(&(*item.bytes.begin()));
+    // Sometimes we will end up with misaligned input (and output) so we can't use them directly.
+    // If this is the case, we will need to temporarily allocate aligned memory, copy the results, and then free it
+    if (reinterpret_cast<uintptr_t>(input) % 64 == 0 && reinterpret_cast<uintptr_t>(output_tensor) % 64 == 0) {
+      #if defined(WASM)
+        ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
+                 "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
+        int8PrepareBFromQuantizedTransposed(reinterpret_cast<const int8_t *>(input),
+                                   (Index)rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary
+                                   (Index)cols(item.shape), //rows here returns the columns of the transposed input matrix, and cols -> the rows
+                                   (int8_t *)output_tensor);
+      #else
+        intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(input),
+                                   output_tensor,
+                                   rows(item.shape),  //Since we only transposed, but didn't update the shape when constructing the binary,
+                                   cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
+      #endif
+    } else {
+      Integer * aligned_input = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
+      std::copy(input, input + rows(item.shape)*cols(item.shape), aligned_input);
+      Integer * aligned_output = reinterpret_cast<Integer *>(genericMalloc(512, rows(item.shape)*cols(item.shape)*sizeof(Integer)));
+      #if defined(WASM)
+        ABORT_IF(intgemm_<vtype>::intgemmType == Type::intgemm16,
+                 "Int16::PrepareBQuantizedTransposed is not implemented for wasm.");
+        int8PrepareBFromQuantizedTransposed(reinterpret_cast<const int8_t *>(aligned_input),
+                                   (Index)rows(item.shape), //Since we only transposed, but didn't update the shape when constructing the binary,
+                                   (Index)cols(item.shape), //rows here returns the columns of the transposed input matrix, and cols -> the rows
+                                   reinterpret_cast<int8_t *>(aligned_output));
+      #else
+        intgemm_<vtype>::width::PrepareBQuantizedTransposed(reinterpret_cast<const Integer *>(aligned_input),
+                                   reinterpret_cast<Integer *>(aligned_output),
+                                   rows(item.shape),  //Since we only transposed, but didn't update the shape when constructing the binary,
+                                   cols(item.shape)); //rows here returns the columns of the transposed input matrix, and cols -> the rows
+      #endif
+      // Copy to output tensor
+      std::copy(aligned_output, aligned_output + rows(item.shape)*cols(item.shape), output_tensor);
+      genericFree(aligned_input);
+      genericFree(aligned_output);
+    }
+    //Copy the quantMult
+    float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
+    *(reinterpret_cast<float *>(&(*(output_tensor + item.shape.elements())))) = quantMult;
+  #else // COMPILE_CPU
+    ABORT("Using intgemm models is supported only with -DCOMPILE_CPU=on");
+  #endif
 }

-template <Type vtype>
-void unquantizeWemb(io::Item &item, const char *input) {
-  typedef typename intgemm_<vtype>::type Integer;
-  float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input)
-                                                      + item.shape.elements()));
-  float *output_tensor = reinterpret_cast<float *>(&(*item.bytes.begin()));
-  // Explicitly calculate n once beforehand because the compiler does not pick up on its
-  // static nature, and will end up calling marian::Shape::dim() a lot.
-  const size_t n = rows(item.shape) * cols(item.shape);
-  for(size_t i = 0; i < n; i++) {
-    output_tensor[i] = reinterpret_cast<const Integer *>(input)[i] * (1 / quantMult);
-  }
+template<Type vtype>
+void unquantizeWemb(io::Item& item, const char * input) {
+    typedef typename intgemm_<vtype>::type Integer;
+    float quantMult = *(reinterpret_cast<const float *>(reinterpret_cast<const Integer *>(input) + item.shape.elements()));
+    float * output_tensor = reinterpret_cast<float *>(&(*item.bytes.begin()));
+    // Explicitly calculate n once beforehand because the compiler does not pick up on its
+    // static nature, and will end up calling marian::Shape::dim() a lot.
+    const size_t n = rows(item.shape) * cols(item.shape);
+    for (size_t i = 0; i < n; i++) {
+      output_tensor[i] = reinterpret_cast<const Integer *>(input)[i]*(1/quantMult);
+    }
 }

-}  // namespace integer
-}  // namespace cpu
-}  // namespace marian
+} //integer
+} //cpu
+} //marian
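Note: the intgemm_ template being reverted and re-applied above is a standard compile-time dispatch: a class template left without a primary definition, plus one explicit specialization per supported quantization width, so the backend types resolve entirely at compile time. A reduced sketch of the pattern (Width and backend_ are illustrative names, not marian's):

    #include <cstdint>

    enum class Width { i8, i16 };

    template <Width w> struct backend_;  // primary template intentionally undefined
    template <> struct backend_<Width::i8>  { using type = std::int8_t;  };
    template <> struct backend_<Width::i16> { using type = std::int16_t; };

    template <Width w>
    using backend_t = typename backend_<w>::type;

    static_assert(sizeof(backend_t<Width::i16>) == 2, "resolved at compile time");

Using an undefined primary template turns an unsupported width into a compile error rather than a silent fallback.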
From 38b608af5ae1af852e68fe9c2e835a22ef11be59 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Fri, 24 Jun 2022 07:48:38 +0000
Subject: [PATCH 65/67] Disable formatting, then local edit -Wall ->
 -Wcomment

---
 src/tensors/cpu/integer_common.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tensors/cpu/integer_common.h b/src/tensors/cpu/integer_common.h
index 389f5c4e8..d05440ef1 100644
--- a/src/tensors/cpu/integer_common.h
+++ b/src/tensors/cpu/integer_common.h
@@ -8,7 +8,7 @@
 #include "3rd_party/intgemm/intgemm/intgemm.h"
 #else // USE_INTGEMM
 #pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wall"
+#pragma GCC diagnostic ignored "-Wcomment"
 #include <ruy/ruy.h>
 #pragma GCC diagnostic pop

From 86c8d44af6edf8d501c6845dc31819e483122237 Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Mon, 27 Jun 2022 11:30:53 +0000
Subject: [PATCH 66/67] Do not check for BLAS on usual ARM, except Mac: Apple
 Accelerate

---
 CMakeLists.txt | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f505188d1..b0e707012 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,7 +69,12 @@ CMAKE_DEPENDENT_OPTION(COMPILE_WITHOUT_EXCEPTIONS "Compile without exceptions" O

 if(${CMAKE_TARGET_ARCHITECTURE_CODE} MATCHES "arm")
   set(USE_RUY ON)
-  set(USE_RUY_SGEMM ON)
+
+  # Apple M1 has Apple Accelerate(?).
+  if(NOT APPLE)
+    set(USE_RUY_SGEMM ON)
+  endif(NOT APPLE)
+
   set(USE_SIMD_UTILS ON)
 else()
   set(USE_INTGEMM ON)
@@ -593,9 +598,8 @@ if(COMPILE_CPU)
     set(EXT_LIBS ${EXT_LIBS} intgemm) # Move the intgemm bits on top since they compile with every single variant
   endif(USE_INTGEMM)

-  if(USE_RUY)
+  if(USE_RUY OR USE_RUY_SGEMM)
     set(EXT_LIBS ${EXT_LIBS} ruy)
-    add_compile_definitions(USE_RUY_SGEMM=1)
   endif(USE_RUY)

   add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU
@@ -613,6 +617,8 @@ if(COMPILE_CPU)
     include_directories("/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/System/Library/Frameworks/Accelerate.framework/Frameworks/vecLib.framework/Headers")
     set(EXT_LIBS ${EXT_LIBS} "-framework Accelerate")
     add_definitions(-DBLAS_FOUND=1)
+  elseif(USE_RUY_SGEMM)
+    add_compile_definitions(USE_RUY_SGEMM=1)
   else(USE_ONNX_SGEMM)
     if(USE_MKL)
       find_package(MKL)

From 861e31d55931bf2578ae2765dd0859e33ee2700c Mon Sep 17 00:00:00 2001
From: Jerin Philip
Date: Fri, 1 Jul 2022 10:58:47 +0000
Subject: [PATCH 67/67] Fix endif: CMake script quirks

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b0e707012..efa096f55 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -600,7 +600,7 @@ if(COMPILE_CPU)

   if(USE_RUY OR USE_RUY_SGEMM)
     set(EXT_LIBS ${EXT_LIBS} ruy)
-  endif(USE_RUY)
+  endif(USE_RUY OR USE_RUY_SGEMM)

   add_definitions(-DCOMPILE_CPU=1) # Move the compile CPU definition on top since we want to compile intgemm when we set compile CPU
   # in case a BLAS vendor is not found, we have a runtime error, although we should probably not allow the compilation to go on
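Note: the float GEMM path that USE_RUY_SGEMM switches in ultimately reduces to a plain ruy multiply. A hedged sketch using ruy's public API (row-major layouts and the ruySgemm wrapper are chosen for illustration; marian's prod_blas.h wrapper differs in detail). It computes C = A * B with A m-by-k and B k-by-n:

    #include "ruy/ruy.h"

    void ruySgemm(const float *A, const float *B, float *C, int m, int k, int n) {
      ruy::Context context;
      ruy::Matrix<float> lhs, rhs, dst;
      ruy::MakeSimpleLayout(m, k, ruy::Order::kRowMajor, lhs.mutable_layout());
      ruy::MakeSimpleLayout(k, n, ruy::Order::kRowMajor, rhs.mutable_layout());
      ruy::MakeSimpleLayout(m, n, ruy::Order::kRowMajor, dst.mutable_layout());
      lhs.set_data(A);
      rhs.set_data(B);
      dst.set_data(C);
      ruy::MulParams<float, float> mulParams;  // plain float multiply, no quantization
      ruy::Mul(lhs, rhs, mulParams, &context, &dst);
    }

On non-Apple ARM this stands in for a BLAS sgemm, which is why the CMake logic above only probes for BLAS (Accelerate) on macOS.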